Merged · Changes from 1 commit (of 28 commits)
- daf2d54: Add data visualization for Anthropic (Vasilije1990, Jan 10, 2025)
- b132ff4: Update cognee-mcp/cognee_mcp/server.py (Vasilije1990, Jan 11, 2025)
- 7b0bfe9: Update cognee-mcp/cognee_mcp/server.py (Vasilije1990, Jan 11, 2025)
- cf4737b: Update cognee-mcp/cognee_mcp/server.py (Vasilije1990, Jan 12, 2025)
- 55e9d64: Add data visualization for Anthropic (Vasilije1990, Jan 13, 2025)
- 047948a: Add data visualization for Anthropic (Vasilije1990, Jan 14, 2025)
- 3ba98b2: Merge branch 'dev' into COG-975 (Vasilije1990, Jan 14, 2025)
- ad07bae: Add data visualization for Anthropic (Vasilije1990, Jan 14, 2025)
- a0e3686: Update README.md (Vasilije1990, Jan 14, 2025)
- 61118dd: Update README.md (Vasilije1990, Jan 14, 2025)
- e71f852: Update README.md (Vasilije1990, Jan 14, 2025)
- 933d21a: Update dockerhub pushes (Vasilije1990, Jan 14, 2025)
- aef7822: Merge branch 'dev' into COG-975 (Vasilije1990, Jan 15, 2025)
- be0b486: Update lock files (Vasilije1990, Jan 16, 2025)
- 662faeb: Update format (Vasilije1990, Jan 16, 2025)
- 4a87df9: Update format (Vasilije1990, Jan 16, 2025)
- 4ae8eb9: Update format (Vasilije1990, Jan 16, 2025)
- 1af24dc: Update format (Vasilije1990, Jan 16, 2025)
- b2355de: Update format (Vasilije1990, Jan 16, 2025)
- 5b31638: Update format (Vasilije1990, Jan 16, 2025)
- f19b58a: Update format (Vasilije1990, Jan 16, 2025)
- 5aaf420: Fix for now (Vasilije1990, Jan 16, 2025)
- 72b503f: Fix for now (Vasilije1990, Jan 16, 2025)
- 7a4a0f4: Fix for now (Vasilije1990, Jan 16, 2025)
- 0783625: Fix for now (Vasilije1990, Jan 16, 2025)
- bbd51e8: Fix for now (Vasilije1990, Jan 16, 2025)
- cb7b2d3: Fix for now (Vasilije1990, Jan 16, 2025)
- fe47253: Fix for now (Vasilije1990, Jan 16, 2025)
Commit ad07bae9a781c909c67658261d579de5834ea3b1: Add data visualization for Anthropic
Vasilije1990 committed Jan 14, 2025
30 changes: 29 additions & 1 deletion cognee/shared/utils.py
@@ -11,7 +11,7 @@
import pandas as pd
import matplotlib.pyplot as plt
import tiktoken

import nltk
⚠️ Potential issue

Remove duplicate nltk import

The nltk module is imported twice. Remove the duplicate import on line 26 to fix the F811 error.

import nltk
import base64
import time
...
- import nltk
from cognee.shared.exceptions import IngestionError

Also applies to: 26-26

🧰 Tools
🪛 GitHub Actions: ruff format

[warning] File needs formatting according to Ruff standards
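As an aside on why the duplicate import only surfaces as a lint warning rather than an error: re-importing a module is a harmless no-op at runtime (the name is simply rebound to the module already cached in sys.modules), so Python never complains; only a linter such as Ruff flags the dead line as F811. A minimal illustration:

```python
import sys
import math
import math  # Ruff would flag this line as F811 (redefinition of unused name)

# The repeated import just rebinds the name to the cached module object,
# so behavior is unchanged; the duplicate is dead code removed for clarity.
assert math is sys.modules["math"]
assert math.sqrt(16) == 4.0
```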

import base64
import time

@@ -30,6 +30,34 @@
proxy_url = "https://test.prometh.ai"



def get_entities(tagged_tokens):
    nltk.download("maxent_ne_chunker", quiet=True)
    from nltk.chunk import ne_chunk

    return ne_chunk(tagged_tokens)
Comment on lines +31 to +35

🛠️ Refactor suggestion

Add docstring and error handling for get_entities()

The function lacks a docstring explaining its purpose and parameters. Also, NLTK resource downloads should be handled with error checking.

 def get_entities(tagged_tokens):
+    """Extract named entities from POS-tagged tokens using NLTK's ne_chunk.
+    
+    Args:
+        tagged_tokens: A list of POS-tagged tokens from nltk.pos_tag()
+    
+    Returns:
+        A tree containing chunks of named entities
+    """
+    try:
         nltk.download("maxent_ne_chunker", quiet=True)
         from nltk.chunk import ne_chunk
+    except Exception as e:
+        logging.error(f"Failed to download NLTK resources: {str(e)}")
+        raise

         return ne_chunk(tagged_tokens)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change

Before:

def get_entities(tagged_tokens):
    nltk.download("maxent_ne_chunker", quiet=True)
    from nltk.chunk import ne_chunk

    return ne_chunk(tagged_tokens)

After:

def get_entities(tagged_tokens):
    """Extract named entities from POS-tagged tokens using NLTK's ne_chunk.

    Args:
        tagged_tokens: A list of POS-tagged tokens from nltk.pos_tag()

    Returns:
        A tree containing chunks of named entities
    """
    try:
        nltk.download("maxent_ne_chunker", quiet=True)
        from nltk.chunk import ne_chunk
    except Exception as e:
        logging.error(f"Failed to download NLTK resources: {str(e)}")
        raise

    return ne_chunk(tagged_tokens)
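Taken together with extract_pos_tags() below, the reviewed function forms a small named-entity-recognition pipeline: tokenize and POS-tag a sentence, then chunk the tagged tokens into entities. A hedged sketch of how the two might be composed (the wrapper name and import path are illustrative, not part of this PR, and actually running it requires nltk plus its punkt, averaged_perceptron_tagger, and maxent_ne_chunker resources):

```python
def extract_named_entities(sentence):
    """Hypothetical helper composing the two functions from this diff:
    POS-tag the sentence, then chunk the tagged tokens into named entities.

    Requires nltk and its 'punkt', 'averaged_perceptron_tagger', and
    'maxent_ne_chunker' resources (downloaded quietly on first use).
    """
    # Import path assumed from the file under review (cognee/shared/utils.py)
    from cognee.shared.utils import extract_pos_tags, get_entities

    tagged = extract_pos_tags(sentence)  # e.g. [('Anthropic', 'NNP'), ...]
    return get_entities(tagged)          # nltk.Tree with PERSON/ORGANIZATION/... chunks

# Example call (not executed here, since it downloads NLTK resources):
# extract_named_entities("Anthropic is based in San Francisco.")
```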



def extract_pos_tags(sentence):
    """Extract Part-of-Speech (POS) tags for words in a sentence."""

    # Ensure that the necessary NLTK resources are downloaded
    nltk.download("words", quiet=True)
    nltk.download("punkt", quiet=True)
    nltk.download("averaged_perceptron_tagger", quiet=True)

    from nltk.tag import pos_tag
    from nltk.tokenize import word_tokenize

    # Tokenize the sentence into words
    tokens = word_tokenize(sentence)

    # Tag each word with its corresponding POS tag
    pos_tags = pos_tag(tokens)

    return pos_tags

Comment on lines +38 to +56

🛠️ Refactor suggestion

Add input validation and improve error handling for extract_pos_tags()

The function should validate input and handle NLTK resource downloads more robustly.

 def extract_pos_tags(sentence):
     """Extract Part-of-Speech (POS) tags for words in a sentence.
+    
+    Args:
+        sentence (str): Input sentence to be POS tagged
+    
+    Returns:
+        list: A list of tuples containing (word, POS_tag)
+    
+    Raises:
+        ValueError: If sentence is not a string or is empty
+        Exception: If NLTK resource download fails
+    """
+    if not isinstance(sentence, str) or not sentence.strip():
+        raise ValueError("Input must be a non-empty string")

+    try:
         # Ensure that the necessary NLTK resources are downloaded
         nltk.download("words", quiet=True)
         nltk.download("punkt", quiet=True)
         nltk.download("averaged_perceptron_tagger", quiet=True)
+    except Exception as e:
+        logging.error(f"Failed to download NLTK resources: {str(e)}")
+        raise

     from nltk.tag import pos_tag
     from nltk.tokenize import word_tokenize

     # Tokenize the sentence into words
     tokens = word_tokenize(sentence)

     # Tag each word with its corresponding POS tag
     pos_tags = pos_tag(tokens)

     return pos_tags
📝 Committable suggestion


Suggested change

Before:

def extract_pos_tags(sentence):
    """Extract Part-of-Speech (POS) tags for words in a sentence."""

    # Ensure that the necessary NLTK resources are downloaded
    nltk.download("words", quiet=True)
    nltk.download("punkt", quiet=True)
    nltk.download("averaged_perceptron_tagger", quiet=True)

    from nltk.tag import pos_tag
    from nltk.tokenize import word_tokenize

    # Tokenize the sentence into words
    tokens = word_tokenize(sentence)

    # Tag each word with its corresponding POS tag
    pos_tags = pos_tag(tokens)

    return pos_tags

After:

def extract_pos_tags(sentence):
    """Extract Part-of-Speech (POS) tags for words in a sentence.

    Args:
        sentence (str): Input sentence to be POS tagged

    Returns:
        list: A list of tuples containing (word, POS_tag)

    Raises:
        ValueError: If sentence is not a string or is empty
        Exception: If NLTK resource download fails
    """
    if not isinstance(sentence, str) or not sentence.strip():
        raise ValueError("Input must be a non-empty string")

    try:
        # Ensure that the necessary NLTK resources are downloaded
        nltk.download("words", quiet=True)
        nltk.download("punkt", quiet=True)
        nltk.download("averaged_perceptron_tagger", quiet=True)
    except Exception as e:
        logging.error(f"Failed to download NLTK resources: {str(e)}")
        raise

    from nltk.tag import pos_tag
    from nltk.tokenize import word_tokenize

    # Tokenize the sentence into words
    tokens = word_tokenize(sentence)

    # Tag each word with its corresponding POS tag
    pos_tags = pos_tag(tokens)

    return pos_tags
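The input guard in this suggestion is pure Python and can be exercised without any NLTK resources. A standalone sketch (the helper name validate_sentence is illustrative; the suggestion above inlines the check directly in extract_pos_tags):

```python
def validate_sentence(sentence):
    """Input guard mirroring the reviewer's suggestion: reject anything
    that is not a non-empty, non-whitespace string."""
    if not isinstance(sentence, str) or not sentence.strip():
        raise ValueError("Input must be a non-empty string")
    return sentence

# Valid input passes through unchanged.
assert validate_sentence("Cognee builds knowledge graphs.") == "Cognee builds knowledge graphs."

# Empty, whitespace-only, or non-string input raises ValueError.
for bad in ("", "   ", None, 42):
    try:
        validate_sentence(bad)
    except ValueError:
        pass
    else:
        raise AssertionError("expected ValueError for %r" % (bad,))
```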


def get_anonymous_id():
"""Creates or reads a anonymous user id"""
home_dir = str(pathlib.Path(pathlib.Path(__file__).parent.parent.parent.resolve()))