Merge pull request #22 from Mascinissa/main

Fix Chunked Response Formatting
2025-09-06 05:12:14 +00:00 · 2025-02-11 17:46:36 +01:00
parent 79d7c116f5 6b80d9db78
commit 407d7dd3c4
1 changed file with 5 additions and 7 deletions
--- a/ollama_proxy_server/main.py
+++ b/ollama_proxy_server/main.py
@@ -75,15 +75,13 @@ def main():
            for key, value in response.headers.items():
                if key.lower() not in ['content-length', 'transfer-encoding', 'content-encoding']:
                    self.send_header(key, value)
-            self.send_header('Transfer-Encoding', 'chunked')
            self.end_headers()

            try:
-                for chunk in response.iter_content(chunk_size=1024):
-                    if chunk:
-                        self.wfile.write(b"%X\r\n%s\r\n" % (len(chunk), chunk))
-                        self.wfile.flush()
-                self.wfile.write(b"0\r\n\r\n")
+                # Read the full content to avoid chunking issues
+                content = response.content
+                self.wfile.write(content)
+                self.wfile.flush()
            except BrokenPipeError:
                pass

@@ -154,7 +152,7 @@ def main():
                    min_queued_server = server

            # Apply the queuing mechanism only for a specific endpoint.
-            if path == '/api/generate' or path == '/api/chat':
+            if path == '/api/generate' or path == '/api/chat' or path == '/v1/chat/completions':
                que = min_queued_server[1]['queue']
                client_ip, client_port = self.client_address
                self.add_access_log_entry(event="gen_request", user=self.user, ip_address=client_ip, access="Authorized", server=min_queued_server[0], nb_queued_requests_on_server=que.qsize())