Skip to content
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions nomic/atlas.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from pyarrow import Table
from tqdm import tqdm

from .data_inference import NomicDuplicatesOptions, NomicEmbedOptions, NomicProjectOptions, NomicTopicOptions
from .data_inference import NomicDuplicatesOptions, NomicEmbedOptions, NomicTopicOptions, ProjectionOptions
from .dataset import AtlasDataset, AtlasDataStream
from .settings import *
from .utils import arrow_iterator, b64int, get_random_name
Expand All @@ -29,7 +29,7 @@ def map_data(
id_field: Optional[str] = None,
is_public: bool = True,
indexed_field: Optional[str] = None,
projection: Union[bool, Dict, NomicProjectOptions] = True,
projection: Optional[Union[Dict, ProjectionOptions]] = None,
topic_model: Union[bool, Dict, NomicTopicOptions] = True,
duplicate_detection: Union[bool, Dict, NomicDuplicatesOptions] = True,
embedding_model: Optional[Union[str, Dict, NomicEmbedOptions]] = None,
Expand All @@ -45,7 +45,7 @@ def map_data(
id_field: Specify your data's unique id field. This field can be up to 36 characters in length. If not specified, one will be created for you named `id_`.
is_public: Should the dataset be accessible outside your Nomic Atlas organization.
indexed_field: The text field from the dataset that will be used to create embeddings, which determines the layout of the data map in Atlas. Required for text data but won't have an impact if uploading embeddings or image blobs.
projection: Options to adjust Nomic Project - the dimensionality algorithm organizing your dataset.
projection: Options for configuring the 2D projection algorithm.
topic_model: Options to adjust Nomic Topic - the topic model organizing your dataset.
duplicate_detection: Options to adjust Nomic Duplicates - the duplicate detection algorithm.
embedding_model: Options to adjust the embedding model used to embed your dataset.
Expand Down Expand Up @@ -166,7 +166,7 @@ def map_data(
name=index_name,
indexed_field=indexed_field,
modality=modality,
projection=projection,
projection=projection, # type: ignore[arg-type]
topic_model=topic_model,
duplicate_detection=duplicate_detection,
embedding_model=embedding_model,
Expand Down
39 changes: 25 additions & 14 deletions nomic/data_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,25 +57,36 @@ def convert_pyarrow_schema_for_atlas(schema: pa.Schema) -> pa.Schema:
return pa.schema({**usertypes, **whitelist})


class NomicProjectOptions(BaseModel):
class ProjectionOptions(BaseModel):
"""
Options for Nomic 2D Dimensionality Reduction Model
Generic options for 2D Dimensionality Reduction

Args:
n_neighbors: The number of neighbors to use when approximating the high dimensional embedding space during reduction. Default: `None` (Auto-inferred).
n_epochs: How many dataset passes to train the projection model. Default: `None` (Auto-inferred).
model: The model to use when generating the 2D projected embedding space layout. Possible values: `None` or `nomic-project-v1` or `nomic-project-v2`. Default: `None`.
local_neighborhood_size: Only used when model is `nomic-project-v2`. Controls the size of the neighborhood used in the local structure optimizing step of `nomic-project-v2` algorithm. Min value: `max(n_neighbors, 1)`; max value: `128`. Default: `None` (Auto-inferred).
spread: Determines how tightly together points appear. Larger values result in a more spread-out point layout. Min value: `0`. It is recommended to leave this value as the default `None` (Auto-inferred).
rho: Only used when model is nomic-project-v2. Controls the spread in the local structure optimizing step of `nomic-project-v2`. Min value: `0`; max value: `1`. It is recommended to leave this value as the default `None` (Auto-inferred).
model: The projection model to use.
n_neighbors: The number of neighbors to use for the projection algorithm.
n_epochs: How many dataset passes to train the projection model.
min_dist: Controls how tightly points are packed together.
spread: Nomic Project specific: Determines how tight together points appear.
local_neighborhood_size: Nomic Project v2 specific: Controls the local neighborhood size.
rho: Nomic Project v2 specific: Controls the spread in local structure.
"""

n_neighbors: Optional[int] = None
n_epochs: Optional[int] = None
spread: Optional[float] = None
local_neighborhood_size: Optional[int] = None
model: Optional[str] = None
rho: Optional[float] = None
model: Optional[str] = Field(
default=None,
description="Projection model to use (e.g., 'umap', 'nomic-project-v1', 'nomic-project-v2').",
)
n_neighbors: Optional[int] = Field(default=None, description="Number of neighbors for the projection algorithm.")
n_epochs: Optional[int] = Field(default=None, description="Number of epochs for training the projection model.")
min_dist: Optional[float] = Field(default=None, description="Minimum distance between points.")
spread: Optional[float] = Field(default=None, description="Nomic Project specific: Spread of the point layout.")
local_neighborhood_size: Optional[int] = Field(
default=None,
description="Nomic Project v2 specific: Local neighborhood size. Only used when model is 'nomic-project-v2'.",
)
rho: Optional[float] = Field(
default=None,
description="Nomic Project v2 specific: Rho parameter. Only used when model is 'nomic-project-v2'.",
)
Comment thread
mcembalest marked this conversation as resolved.


class NomicTopicOptions(BaseModel):
Expand Down
66 changes: 31 additions & 35 deletions nomic/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
from .data_inference import (
NomicDuplicatesOptions,
NomicEmbedOptions,
NomicProjectOptions,
NomicTopicOptions,
ProjectionOptions,
convert_pyarrow_schema_for_atlas,
)
from .data_operations import AtlasMapData, AtlasMapDuplicates, AtlasMapEmbeddings, AtlasMapTags, AtlasMapTopics
Expand Down Expand Up @@ -1049,7 +1049,7 @@ def get_map(
for projection in index.projections:
if projection.id == projection_id:
return projection
raise ValueError(f"Could not find a map with projection_id='{atlas_index_id}'")
raise ValueError(f"Could not find a map with projection_id='{projection_id}'")

if len(indices) == 0:
raise ValueError("You have no maps built in your project")
Expand All @@ -1071,7 +1071,7 @@ def create_index(
name: Optional[str] = None,
indexed_field: Optional[str] = None,
modality: Optional[str] = None,
projection: Union[bool, Dict, NomicProjectOptions] = True,
projection: Union[Dict, ProjectionOptions, None] = None,
topic_model: Union[bool, Dict, NomicTopicOptions] = True,
duplicate_detection: Union[bool, Dict, NomicDuplicatesOptions] = True,
embedding_model: Optional[Union[str, Dict, NomicEmbedOptions]] = None,
Expand All @@ -1085,7 +1085,7 @@ def create_index(
indexed_field: For text datasets, name the data field corresponding to the text to be mapped.
reuse_embeddings_from_index: the name of the index to reuse embeddings from.
modality: The data modality of this index. Currently, Atlas supports either `text`, `image`, or `embedding` indices.
projection: Options for configuring the 2D projection algorithm
projection: Options for configuring the 2D projection algorithm, or None to let the cloud decide
topic_model: Options for configuring the topic model
duplicate_detection: Options for configuring semantic duplicate detection
embedding_model: Options for configuring the embedding model
Expand All @@ -1097,10 +1097,16 @@ def create_index(

self._latest_dataset_state()

if isinstance(projection, Dict):
projection = NomicProjectOptions(**projection)
else:
projection = NomicProjectOptions()
projection_options: Optional[ProjectionOptions] = None

if isinstance(projection, ProjectionOptions):
projection_options = projection
elif isinstance(projection, dict):
projection_options = ProjectionOptions(**projection)

projection_hyperparameters: dict = {}
if projection_options is not None:
Comment thread
mcembalest marked this conversation as resolved.
projection_hyperparameters = projection_options.model_dump()

topic_model_was_false = topic_model is False
if isinstance(topic_model, Dict):
Expand Down Expand Up @@ -1134,9 +1140,9 @@ def create_index(
modality = self.meta["modality"]

if modality == "image":
if indexed_field is not None and indexed_field != "_blob_hash":
logger.warning("Ignoring user-provided indexed_field for image datasets. Using _blob_hash.")
indexed_field = "_blob_hash"
if indexed_field is not None:
logger.warning("Ignoring indexed_field for image datasets. Only _blob_hash is supported.")

colorable_fields = []

Expand All @@ -1150,6 +1156,7 @@ def create_index(
logger.warning(
"You did not specify the `topic_label_field` option in your topic_model, your dataset will not contain auto-labeled topics."
)

build_template = {
"project_id": self.id,
"index_name": name,
Expand All @@ -1160,21 +1167,11 @@ def create_index(
"model_hyperparameters": None,
"nearest_neighbor_index": "HNSWIndex",
"nearest_neighbor_index_hyperparameters": json.dumps({"space": "l2", "ef_construction": 100, "M": 16}),
"projection": "NomicProject",
"projection_hyperparameters": json.dumps(
{
"n_neighbors": projection.n_neighbors,
"n_epochs": projection.n_epochs,
"spread": projection.spread,
"local_neighborhood_size": projection.local_neighborhood_size,
"rho": projection.rho,
"model": projection.model,
}
),
"projection_hyperparameters": json.dumps(projection_hyperparameters),
Comment thread
wilsonjr marked this conversation as resolved.
"topic_model_hyperparameters": json.dumps(
{
"build_topic_model": topic_model.build_topic_model,
"community_description_target_field": topic_model.topic_label_field, # TODO change key to topic_label_field post v0.0.85
"community_description_target_field": topic_model.topic_label_field,
"cluster_method": topic_model.cluster_method,
"enforce_topic_hierarchy": topic_model.enforce_topic_hierarchy,
}
Expand All @@ -1186,9 +1183,13 @@ def create_index(
}
),
}
if projection is not None and projection_options is not None:
build_template["projection"] = (
projection_options.model if projection_options.model else "nomic-project-v2"
)
build_template["projection_hyperparameters"] = json.dumps(projection_hyperparameters)
Comment thread
mcembalest marked this conversation as resolved.
Outdated

elif modality == "text" or modality == "image":
# find the index id of the index with name reuse_embeddings_from_index
reuse_embedding_from_index_id = None
indices = self.indices
if reuse_embeddings_from_index is not None:
Expand Down Expand Up @@ -1239,22 +1240,12 @@ def create_index(
),
"nearest_neighbor_index": "HNSWIndex",
"nearest_neighbor_index_hyperparameters": json.dumps({"space": "l2", "ef_construction": 100, "M": 16}),
"projection": "NomicProject",
"projection_hyperparameters": json.dumps(
{
"n_neighbors": projection.n_neighbors,
"n_epochs": projection.n_epochs,
"spread": projection.spread,
"local_neighborhood_size": projection.local_neighborhood_size,
"rho": projection.rho,
"model": projection.model,
}
),
"projection_hyperparameters": json.dumps(projection_hyperparameters),
Comment thread
wilsonjr marked this conversation as resolved.
"topic_model_hyperparameters": json.dumps(
{
"build_topic_model": topic_model.build_topic_model,
"community_description_target_field": topic_field,
"cluster_method": topic_model.build_topic_model,
"cluster_method": topic_model.cluster_method,
Comment thread
mcembalest marked this conversation as resolved.
"enforce_topic_hierarchy": topic_model.enforce_topic_hierarchy,
}
),
Expand All @@ -1265,6 +1256,11 @@ def create_index(
}
),
}
if projection is not None and projection_options is not None:
build_template["projection"] = (
projection_options.model if projection_options.model else "nomic-project-v2"
)
build_template["projection_hyperparameters"] = json.dumps(projection_hyperparameters)
Comment thread
mcembalest marked this conversation as resolved.
Outdated

response = requests.post(
self.atlas_api_path + "/v1/project/index/create",
Expand Down
3 changes: 0 additions & 3 deletions nomic/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,5 @@
DEFAULT_LARGE_PROJECTION_EPOCHS = 128
DEFAULT_INDEX_N_NEIGHBORS = 32
DEFAULT_PROJECTION_RHO = 0.2

DEFAULT_PROJECTION_MODEL = "nomic-project-v1"

DEFAULT_PROJECTION_SPREAD = 1.0
DEFAULT_DUPLICATE_THRESHOLD = 0.1
Loading