
Commit ea035d9

Address CodeRabbit review feedback
- Fix Pydantic model constraints with Field validators
- Use timezone-aware UTC timestamps
- Fix DataPoint import path to stable module location
- Update OpenAI provider for both new and legacy API compatibility
- Add request timeouts for OpenAI calls
- Fix NoOp provider confidence score (0.0 instead of 1.0)
- Normalize provider names and simplify control flow
- Use model_dump() for metadata serialization compatibility
- Only attach translation metadata when content actually changes
- Improve error logging with content_id context
1 parent 8feab1d commit ea035d9

File tree

2 files changed: +74 -37 lines


cognee/tasks/translation/models.py

Lines changed: 6 additions & 6 deletions
@@ -1,10 +1,10 @@
 from __future__ import annotations
 
-from datetime import datetime
+from datetime import datetime, timezone
 from typing import Dict
 from pydantic import Field
 
-from cognee.infrastructure.engine import DataPoint
+from cognee.infrastructure.engine.models import DataPoint
 
 
 class TranslatedContent(DataPoint):
@@ -21,8 +21,8 @@ class TranslatedContent(DataPoint):
     source_language: str
     target_language: str = "en"
     translation_provider: str = "noop"
-    confidence_score: float = 0.0
-    translation_timestamp: datetime = Field(default_factory=datetime.utcnow)
+    confidence_score: float = Field(0.0, ge=0.0, le=1.0)
+    translation_timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
     metadata: Dict = Field(default_factory=lambda: {"index_fields": ["source_language", "original_chunk_id"]})
 
 
@@ -34,7 +34,7 @@ class LanguageMetadata(DataPoint):
     """
     content_id: str
     detected_language: str
-    language_confidence: float = 0.0
+    language_confidence: float = Field(0.0, ge=0.0, le=1.0)
     requires_translation: bool = False
-    character_count: int = 0
+    character_count: int = Field(0, ge=0)
     metadata: Dict = Field(default_factory=lambda: {"index_fields": ["detected_language", "content_id"]})
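As a side note on what these constraints change in practice, here is a minimal standalone sketch (assuming Pydantic v2; the model below is a throwaway stand-in, not the project's TranslatedContent or LanguageMetadata):

from datetime import datetime, timezone
from pydantic import BaseModel, Field, ValidationError

class ConstraintSketch(BaseModel):
    # Mirrors the constraints added in this commit, on a stand-in model.
    confidence_score: float = Field(0.0, ge=0.0, le=1.0)
    character_count: int = Field(0, ge=0)
    translation_timestamp: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )

print(ConstraintSketch().translation_timestamp.tzinfo)  # timezone.utc, no longer naive

try:
    ConstraintSketch(confidence_score=1.5)  # violates le=1.0
except ValidationError as err:
    print(err.errors()[0]["loc"], err.errors()[0]["type"])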

cognee/tasks/translation/translate_content.py

Lines changed: 68 additions & 31 deletions
@@ -30,9 +30,9 @@ async def detect_language(self, text: str) -> Tuple[str, float]:
         except UnicodeEncodeError:
             return "unknown", 0.4
 
-    async def translate(self, text: str, target_language: str) -> Tuple[str, float]:
+    async def translate(self, text: str, _target_language: str) -> Tuple[str, float]:
         # No translation performed
-        return text, 1.0
+        return text, 0.0
 
 
 try:
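A quick usage sketch of the corrected NoOp behaviour (hypothetical call site; it assumes NoOpProvider is importable from this task module): the text comes back untouched and the confidence is now 0.0, so callers can tell no real translation happened.

import asyncio

# Hypothetical: assumes NoOpProvider is exposed by the task module.
from cognee.tasks.translation.translate_content import NoOpProvider

async def main() -> None:
    provider = NoOpProvider()
    text, confidence = await provider.translate("Hallo Welt", "en")
    print(text, confidence)  # "Hallo Welt" 0.0 -- unchanged text, zero confidence

asyncio.run(main())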
@@ -58,22 +58,42 @@ async def detect_language(self, text: str) -> Tuple[str, float]:
     import openai
 
     class OpenAIProvider:
-        def __init__(self):
+        def __init__(self, model: Optional[str] = None, timeout: float = 30.0):
+            # Prefer modern client; fall back to legacy globals.
+            self.model = model or os.getenv("OPENAI_TRANSLATE_MODEL", "gpt-4o-mini")
+            self.timeout = float(os.getenv("OPENAI_TIMEOUT", timeout))
             key = os.getenv("OPENAI_API_KEY")
-            if key:
+            # If the new client exists, use it; otherwise fall back to global api_key
+            self._client = getattr(openai, "OpenAI", None)
+            if self._client:
+                # instantiate client with key if provided
+                self._client = self._client(api_key=key) if key else self._client()
+            elif key and hasattr(openai, "api_key"):
                 openai.api_key = key
 
         async def detect_language(self, text: str) -> Tuple[str, float]:
             try:
-                resp = await asyncio.to_thread(
-                    openai.ChatCompletion.create,
-                    model="gpt-3.5-turbo",
-                    messages=[
-                        {"role": "system", "content": "You are a language detection assistant."},
-                        {"role": "user", "content": f"What language is this? Reply with 'lang: <code>' and 'confidence: <0-1>'\nText:\n{text[:1000]}"},
-                    ],
-                    max_tokens=20,
-                )
+                if self._client:
+                    resp = await asyncio.to_thread(
+                        self._client.chat.completions.create,
+                        model=self.model,
+                        messages=[
+                            {"role": "system", "content": "You are a language detection assistant."},
+                            {"role": "user", "content": f"What language is this? Reply with 'lang: <code>' and 'confidence: <0-1>'\nText:\n{text[:1000]}"},
+                        ],
+                        timeout=self.timeout,
+                    )
+                else:
+                    resp = await asyncio.to_thread(
+                        openai.ChatCompletion.create,
+                        model=self.model,
+                        messages=[
+                            {"role": "system", "content": "You are a language detection assistant."},
+                            {"role": "user", "content": f"What language is this? Reply with 'lang: <code>' and 'confidence: <0-1>'\nText:\n{text[:1000]}"},
+                        ],
+                        max_tokens=20,
+                        request_timeout=self.timeout,
+                    )
                 out = resp.choices[0].message.content or ""
                 # naive parse
                 lang = "unknown"
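The same new-client/legacy fallback is used by translate() below. Stripped of the task-specific prompts, the compatibility shim is just feature detection on the installed openai package; a rough standalone sketch (the function name and payload here are illustrative, not part of the commit):

import os
import openai

def chat_completion_compat(model: str, messages: list, timeout: float):
    """Dispatch to whichever OpenAI API surface is installed (sketch only)."""
    client_cls = getattr(openai, "OpenAI", None)
    if client_cls is not None:
        # openai>=1.0: per-client instance, per-request timeout kwarg
        client = client_cls(api_key=os.getenv("OPENAI_API_KEY"))
        return client.chat.completions.create(model=model, messages=messages, timeout=timeout)
    # openai<1.0: module-level globals, request_timeout kwarg
    openai.api_key = os.getenv("OPENAI_API_KEY")
    return openai.ChatCompletion.create(model=model, messages=messages, request_timeout=timeout)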
@@ -92,15 +112,27 @@ async def detect_language(self, text: str) -> Tuple[str, float]:
 
         async def translate(self, text: str, target_language: str) -> Tuple[str, float]:
             try:
-                resp = await asyncio.to_thread(
-                    openai.ChatCompletion.create,
-                    model="gpt-3.5-turbo",
-                    messages=[
-                        {"role": "system", "content": "You are a helpful translator. Translate the user text to the target language exactly and nothing else."},
-                        {"role": "user", "content": f"Translate to {target_language}:\n\n{text[:3000]}"},
-                    ],
-                    max_tokens=1000,
-                )
+                if self._client:
+                    resp = await asyncio.to_thread(
+                        self._client.chat.completions.create,
+                        model=self.model,
+                        messages=[
+                            {"role": "system", "content": "You are a helpful translator. Translate the user text to the target language exactly and nothing else."},
+                            {"role": "user", "content": f"Translate to {target_language}:\n\n{text[:3000]}"},
+                        ],
+                        timeout=self.timeout,
+                    )
+                else:
+                    resp = await asyncio.to_thread(
+                        openai.ChatCompletion.create,
+                        model=self.model,
+                        messages=[
+                            {"role": "system", "content": "You are a helpful translator. Translate the user text to the target language exactly and nothing else."},
+                            {"role": "user", "content": f"Translate to {target_language}:\n\n{text[:3000]}"},
+                        ],
+                        max_tokens=1000,
+                        request_timeout=self.timeout,
+                    )
                 translated = (resp.choices[0].message.content or "").strip()
                 return translated, 0.9
             except Exception:
@@ -112,12 +144,12 @@ async def translate(self, text: str, target_language: str) -> Tuple[str, float]:
 
 def _get_provider(name: str) -> TranslationProvider:
     """Get translation provider by name."""
+    name = (name or "noop").lower()
     if name == "openai" and OpenAIProvider is not None:
         return OpenAIProvider()
-    elif name == "langdetect":
+    if name == "langdetect":
         return LangDetectProvider()
-    else:
-        return NoOpProvider()
+    return NoOpProvider()
 
 
 async def translate_content(
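With the name normalized up front, provider lookup is case-insensitive and anything empty or unknown falls back to NoOp; an illustrative call pattern (_get_provider is module-private, so this is exposition rather than a public API, and "OpenAI" would likewise resolve when the openai package and OPENAI_API_KEY are available):

from cognee.tasks.translation.translate_content import _get_provider

print(type(_get_provider("LANGDETECT")).__name__)      # LangDetectProvider (case-insensitive)
print(type(_get_provider("")).__name__)                # NoOpProvider (empty name defaults to "noop")
print(type(_get_provider("something-else")).__name__)  # NoOpProvider (unknown names fall back)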
@@ -148,7 +180,7 @@ async def translate_content(
         try:
             lang, conf = await provider.detect_language(text)
         except Exception:
-            logger.exception("language detection failed")
+            logger.exception("language detection failed for content_id=%s", content_id)
             lang, conf = "unknown", 0.0
 
         requires_translation = (lang != target_language) and (conf >= confidence_threshold)
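For context, logger.exception with printf-style arguments keeps the traceback and formats the content_id lazily, so no string is built unless the record is actually emitted; a self-contained illustration (names here are illustrative):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("translation-demo")

def simulate_detection_failure(content_id: str) -> None:
    try:
        raise RuntimeError("detector unavailable")
    except Exception:
        # The traceback is attached automatically; %s is filled in by the logging machinery.
        logger.exception("language detection failed for content_id=%s", content_id)

simulate_detection_failure("doc-123")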
@@ -162,7 +194,7 @@ async def translate_content(
 
         # attach language metadata to chunk.metadata
        chunk.metadata = getattr(chunk, "metadata", {}) or {}
-        chunk.metadata["language"] = lang_meta
+        chunk.metadata["language"] = lang_meta.model_dump()
 
         # perform translation when necessary
         if requires_translation:
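Why model_dump() rather than the model instance: the metadata dict stays plain-JSON serializable. A minimal illustration with a stand-in model (assuming Pydantic v2; not the project's LanguageMetadata):

import json
from pydantic import BaseModel

class LangStub(BaseModel):  # stand-in, not cognee's LanguageMetadata
    detected_language: str
    language_confidence: float

meta = LangStub(detected_language="de", language_confidence=0.92)

print(json.dumps({"language": meta.model_dump()}))  # works: nested plain dict
try:
    json.dumps({"language": meta})                   # model instances are not JSON-serializable
except TypeError as err:
    print(err)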
@@ -181,10 +213,15 @@ async def translate_content(
                 translation_provider=translation_provider,
                 confidence_score=t_conf,
             )
-            chunk.metadata["translation"] = trans
-
-            # Use translated content for subsequent tasks
-            chunk.text = translated_text
+            if translated_text != text:
+                chunk.metadata["translation"] = trans.model_dump()
+                # Use translated content for subsequent tasks
+                chunk.text = translated_text
+            else:
+                logger.info(
+                    "Skipping translation metadata; provider returned unchanged text for content_id=%s",
+                    content_id,
+                )
 
         enhanced.append(chunk)
 
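Putting the two metadata changes together, a chunk that was actually translated ends up with roughly this shape (values are made up and the field lists are partial, covering only what is visible in this diff; the real dumps include whatever else DataPoint contributes):

# Illustrative only -- invented values, partial field lists.
chunk_metadata = {
    "language": {                     # from LanguageMetadata.model_dump()
        "content_id": "chunk-1",
        "detected_language": "de",
        "language_confidence": 0.92,
        "requires_translation": True,
        "character_count": 1834,
    },
    "translation": {                  # from TranslatedContent.model_dump();
        "source_language": "de",      # present only when the provider returned changed text
        "target_language": "en",
        "translation_provider": "openai",
        "confidence_score": 0.9,
    },
}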
