Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add method to compute repo and resource path for a PURL
Signed-off-by: Keshav Priyadarshi <git@keshav.space>
  • Loading branch information
keshav-space committed Nov 24, 2025
commit d9086ccb400ef0b9c52469eac8ef91443f1a7d3c
104 changes: 77 additions & 27 deletions aboutcode/federated/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,16 @@
# See https://aboutcode.org for more information about our open source projects.
#

from dataclasses import asdict
from dataclasses import dataclass
from dataclasses import field as datafield
from hashlib import sha256
from pathlib import Path
from pathlib import PurePosixPath
from typing import Any
from typing import Iterable
from typing import Optional
from typing import Tuple
from typing import Union
from urllib.parse import quote
from urllib.parse import urljoin
from urllib.parse import urlsplit

import requests
Expand Down Expand Up @@ -247,7 +245,7 @@
Example of repo and dir names
-----------------------------

With 4 dirs per repo, we get 256 repos, like tehse
With 4 dirs per repo, we get 256 repos, like these

purls-npm-0000
npm-0000
Expand Down Expand Up @@ -367,12 +365,12 @@
- unlock the cluster.

We may need to keep the old and new Clusters around too, and may need to add a
simple DataCluster version suffix in Cluter names, and a way to redirect from an
simple DataCluster version suffix in Cluster names, and a way to redirect from an
old frozen, inactive DataCluster to a new rebalanced one.

It may even be possible to continue writing to a cluster as long as writing is
done in two places until the split is completed. In practice split should be
reasonbly rare and reasonably fast, making this a lesser issue.
reasonably rare and reasonably fast, making this a lesser issue.

It is also possible to change the PURL hashid range for a DataCluster, say going
from 1024 to 2049, 4096 or 8192. This would imply moving all the files around
Expand Down Expand Up @@ -416,7 +414,7 @@ def get_package_base_dir(purl: Union[PackageURL, str]):
@dataclass
class DataFederation:
"""
A data federation is the root object and holds theconfiguration defining its
A data federation is the root object and holds the configuration defining its
data clusters, data kinds, PURL types and data repositories.
"""

Expand Down Expand Up @@ -564,7 +562,7 @@ def from_url(
headers = {"User-Agent": "AboutCode/FederatedCode"}
response = requests.get(url=rcf_url, headers=headers)
if not response.ok:
raise Exception(f"Failed to fetch Feration config: {rcf_url}")
raise Exception(f"Failed to fetch Federation config: {rcf_url}")

return cls.from_yaml_config(
name=name,
Expand Down Expand Up @@ -610,7 +608,7 @@ def to_dict(self):

def to_yaml(self):
"""
Return a YAMML text string for this federation configuration.
Return a YAML text string for this federation configuration.
"""
return saneyaml.dump(self.to_dict())

Expand Down Expand Up @@ -684,7 +682,7 @@ def get_local_datafile(self, data_kind: str, purl: Union[str, PackageURL]) -> "L

@dataclass
class LocalDataFile:
"""A local data file storeed optionally in a GitRepo"""
"""A local data file stored optionally in a GitRepo"""

path: Path
git_repo: "GitRepo" = None
Expand All @@ -706,10 +704,10 @@ class DataCluster:
# include directory and repository.
#
# For instance for a purls.yml file stored for each package:
# {/namespace}{/name}/purls.yml
# {namespace}/{name}/purls.yml
#
# For a scancode.json file stored for each package version:
# {/namespace}{/name}{/version}/scancode.json
# {namespace}/{name}/{version}/scancode.json
datafile_path_template: str

# list of unique PurlTypeConfig for types stored in this data cluster.
Expand Down Expand Up @@ -765,7 +763,7 @@ def populate_repos(self):

for ptc in self.purl_type_configs:
drbpt[ptc.purl_type] = [repo for repo in ptc.get_repos(data_kind=kind)]

def populate_configs(self):
for ptc in self.purl_type_configs:
self._configs_by_purl_type[ptc.purl_type] = ptc
Expand Down Expand Up @@ -840,6 +838,58 @@ def get_local_datafile(self, purl: Union[str, PackageURL]) -> LocalDataFile:
"""
raise NotImplementedError()

Comment thread
keshav-space marked this conversation as resolved.
def get_config(self, purl_type: str) -> "PurlTypeConfig":
    """
    Return the PurlTypeConfig stored for the ``purl_type`` string.

    When this purl type has no entry of its own, return the entry stored
    under the "default" key instead. A KeyError propagates if that
    fallback entry is missing too.
    """
    configs = self._configs_by_purl_type
    # Membership test (not truthiness) decides the fallback, so a stored
    # falsy value is still returned as-is.
    lookup_key = purl_type if purl_type in configs else "default"
    return configs[lookup_key]

def get_datafile_relative_path(self, purl: Union[str, PackageURL]) -> str:
    """
    Return the datafile path for a PURL, relative to the root of a cluster
    directory, by filling in this cluster ``datafile_path_template``.

    The ``{namespace}`` field is replaced by "/" followed by the PURL
    namespace, or by an empty string when the PURL has no namespace.
    With the "{namespace}/{name}/..." templates the returned path
    therefore starts with a "/".

    Raise a ValueError if the template uses ``{version}`` and the PURL has
    no version.
    """
    package_url = as_purl(purl=purl)

    version_is_missing = not package_url.version
    if version_is_missing and "{version}" in self.datafile_path_template:
        raise ValueError(
            f"DataCluster '{self.data_kind}' needs PackageURL with version to generate path."
        )

    # The namespace is optional in a PURL: emit its own "/" prefix only
    # when there is a namespace to render.
    namespace_segment = ""
    if package_url.namespace:
        namespace_segment = f"/{package_url.namespace}"

    template_fields = {
        "namespace": namespace_segment,
        "name": package_url.name,
        "version": package_url.version,
    }
    return self.datafile_path_template.format(**template_fields)

def get_repo_and_dir_hash(self, purl: Union[str, PackageURL]) -> Tuple[str, str]:
    """
    Return a (repository hash, directory hash) tuple of strings for a PURL.

    The directory hash is the hash id string returned by compute_purl_hash
    for this PURL. The repository hash is that hash id taken as an
    integer, lowered to the nearest multiple of the
    ``numbers_of_dirs_per_repo`` of the config for this PURL type, and
    zero-padded to a width of four.
    """
    package_url = as_purl(purl=purl)
    type_config = self.get_config(package_url.type)
    dir_hash = compute_purl_hash(purl=package_url)

    # Dropping the remainder yields the number of the first directory
    # that shares a repository with this PURL directory.
    dir_number = int(dir_hash)
    offset_in_repo = dir_number % type_config.numbers_of_dirs_per_repo
    repo_number = dir_number - offset_in_repo

    return f"{repo_number:04}", dir_hash

def get_datafile_repo_and_path(self, purl: Union[str, PackageURL]) -> Tuple[str, str]:
    """
    Return a (repository name, datafile path) tuple of strings for the
    data kind stored in this cluster, given a PURL.

    The repository name is "<data_kind>-<purl type>-<repo hash>". The
    datafile path is the "<purl type>-<dir hash>" directory name directly
    followed by the path returned by get_datafile_relative_path().
    """
    package_url = as_purl(purl)
    repo_hash, dir_hash = self.get_repo_and_dir_hash(package_url)
    path_in_directory = self.get_datafile_relative_path(package_url)
    purl_type = package_url.type

    # No separator is inserted between the directory name and the
    # path: the two strings are joined as-is.
    return (
        f"{self.data_kind}-{purl_type}-{repo_hash}",
        f"{purl_type}-{dir_hash}{path_in_directory}",
    )


@dataclass
class PurlTypeConfig:
Expand Down Expand Up @@ -928,7 +978,7 @@ def get_repos(self, data_kind: str) -> Iterable["DataRepository"]:
@classmethod
def default_config(cls) -> "PurlTypeConfig":
"""
Return the default used when nothing is speced for a type
Return the default used when nothing is specified for a type
"""
return cls(
purl_type="default",
Expand Down Expand Up @@ -1076,7 +1126,7 @@ def cluster_preset():
DataCluster(
data_kind="purls",
description="List of fully qualified PURL strings for a package, sorted by version.",
datafile_path_template="{/namespace}/{name}/purls.yml",
datafile_path_template="{namespace}/{name}/purls.yml",
purl_type_configs=PurlTypeConfig.small_size_configs(),
data_schema_url="",
documentation_url="https://github.com/package-url/purl-spec/",
Expand Down Expand Up @@ -1110,7 +1160,7 @@ def cluster_preset():
data_kind="purldb",
description="PurlDB normalized metadata datafiles for each package "
"versions. Does not include fingerprints and symbols.",
datafile_path_template="{/namespace}/{name}/{version}/purldb.json",
datafile_path_template="{namespace}/{name}/{version}/purldb.json",
purl_type_configs=PurlTypeConfig.large_size_configs(),
data_schema_url="",
documentation_url="",
Expand All @@ -1121,7 +1171,7 @@ def cluster_preset():
data_kind="vulnerabilities",
description="VulnerableCode vulnerabilities for each package. "
"Also includes a separate vulnerabilities directory/",
datafile_path_template="{/namespace}/{name}/vulnerabilities.json",
datafile_path_template="{namespace}/{name}/vulnerabilities.json",
purl_type_configs=[PurlTypeConfig.default_config()],
data_schema_url="",
documentation_url="",
Expand All @@ -1130,7 +1180,7 @@ def cluster_preset():
DataCluster(
data_kind="security_advisories",
description="VulnerableCode security advisories for each package version.",
datafile_path_template="{/namespace}/{name}/{version}/advisories.json",
datafile_path_template="{namespace}/{name}/{version}/advisories.json",
purl_type_configs=[PurlTypeConfig.default_config()],
data_schema_url="",
documentation_url="",
Expand All @@ -1139,7 +1189,7 @@ def cluster_preset():
DataCluster(
data_kind="scancode_toolkit_scans",
description="scancode toolkit scans for each package version.",
datafile_path_template="{/namespace}/{name}/{version}/scancode-toolkit.json",
datafile_path_template="{namespace}/{name}/{version}/scancode-toolkit.json",
purl_type_configs=PurlTypeConfig.large_size_configs(),
data_schema_url="",
documentation_url="",
Expand All @@ -1148,7 +1198,7 @@ def cluster_preset():
DataCluster(
data_kind="scancode_fingerprints",
description="scancode_fingerprints for each package version.",
datafile_path_template="{/namespace}/{name}/{version}/scancode-fingerprints.json",
datafile_path_template="{namespace}/{name}/{version}/scancode-fingerprints.json",
purl_type_configs=PurlTypeConfig.large_size_configs(),
data_schema_url="",
documentation_url="",
Expand All @@ -1157,7 +1207,7 @@ def cluster_preset():
DataCluster(
data_kind="cyclonedx14_sboms",
description="CycloneDX v1.4 sboms for each package version",
datafile_path_template="{/namespace}/{name}/{version}/cyclonedx-14.json",
datafile_path_template="{namespace}/{name}/{version}/cyclonedx-14.json",
purl_type_configs=PurlTypeConfig.large_size_configs(),
data_schema_url="",
documentation_url="",
Expand All @@ -1166,7 +1216,7 @@ def cluster_preset():
DataCluster(
data_kind="cyclonedx15_sboms",
description="CycloneDX v1.5 sboms for each package version",
datafile_path_template="{/namespace}/{name}/{version}/cyclonedx-15.json",
datafile_path_template="{namespace}/{name}/{version}/cyclonedx-15.json",
purl_type_configs=PurlTypeConfig.large_size_configs(),
data_schema_url="",
documentation_url="",
Expand All @@ -1175,7 +1225,7 @@ def cluster_preset():
DataCluster(
data_kind="cyclonedx16_sboms",
description="CycloneDX v1.6 sboms for each package version",
datafile_path_template="{/namespace}/{name}/{version}/cyclonedx-16.json",
datafile_path_template="{namespace}/{name}/{version}/cyclonedx-16.json",
purl_type_configs=PurlTypeConfig.large_size_configs(),
data_schema_url="",
documentation_url="",
Expand All @@ -1184,7 +1234,7 @@ def cluster_preset():
DataCluster(
data_kind="spdx2_sboms",
description="SPDX version 2.x sboms for each package version",
datafile_path_template="{/namespace}/{name}/{version}/spdx-2.json",
datafile_path_template="{namespace}/{name}/{version}/spdx-2.json",
purl_type_configs=PurlTypeConfig.large_size_configs(),
data_schema_url="",
documentation_url="",
Expand All @@ -1193,7 +1243,7 @@ def cluster_preset():
DataCluster(
data_kind="atom_slices",
description="Atom slices for each package version",
datafile_path_template="{/namespace}/{name}/{version}/atom.json",
datafile_path_template="{namespace}/{name}/{version}/atom.json",
purl_type_configs=PurlTypeConfig.large_size_configs(),
data_schema_url="",
documentation_url="",
Expand All @@ -1203,7 +1253,7 @@ def cluster_preset():
data_kind="atom_vulnerable_slices",
description="Atom vulnerable_slices for each vulnerable package version",
# FIXME: need to qualify these with an advisory / CVE?
datafile_path_template="{/namespace}/{name}/{version}/atom-vulnerable.json",
datafile_path_template="{namespace}/{name}/{version}/atom-vulnerable.json",
purl_type_configs=PurlTypeConfig.large_size_configs(),
data_schema_url="",
documentation_url="",
Expand All @@ -1213,7 +1263,7 @@ def cluster_preset():
data_kind="openssf_security_scorecards",
description="OpenSSf security_scorecards for package",
# FIXME: need to qualify these with an advisory / CVE?
datafile_path_template="{/namespace}/{name}/security_scorecard.json",
datafile_path_template="{namespace}/{name}/security_scorecard.json",
purl_type_configs=PurlTypeConfig.medium_size_configs(),
data_schema_url="",
documentation_url="",
Expand Down
Loading