Matched version checking fixed

apiiro · cognitivegears · Nov 19, 2024 · Nov 19, 2024 · Nov 19, 2024 · Nov 19, 2024
commit bf9f95d7c2e16d7c5c860fd974f854e8a9ae1737
diff --git a/src/analysis/heuristics.py b/src/analysis/heuristics.py
@@ -4,6 +4,7 @@
 import math
 from datetime import datetime, timezone
 from constants import Constants, DefaultHeuristics
+from common.logging_utils import is_debug_enabled, extra_context
 
 STG = f"{Constants.ANALYSIS} "
 # Repository signals scoring constants
@@ -97,6 +98,173 @@ def compute_repo_signals_score(mp):
 
     # Clamp the final score
     return max(REPO_SCORE_CLAMP_MIN, min(REPO_SCORE_CLAMP_MAX, score))
+
+def _clamp01(value):
+    """Clamp a numeric value into [0.0, 1.0].
+
+    Args:
+        value: Anything accepted by ``float()``.
+
+    Returns:
+        float: ``value`` limited to [0.0, 1.0]; 0.0 when ``float(value)`` raises.
+    """
+    try:
+        v = float(value)
+    except Exception:
+        # Unconvertible input is scored as the minimum instead of propagating the error.
+        return 0.0
+    # NOTE(review): NaN fails both comparisons below and is returned unchanged.
+    return 0.0 if v < 0.0 else 1.0 if v > 1.0 else v
+
+def _norm_base_score(base):
+    """Normalize an existing base score (already expected to be 0..1, but clamp defensively).
+
+    Args:
+        base: Existing score, or None when no score is available.
+
+    Returns:
+        float or None: The score clamped to [0.0, 1.0]; None when ``base`` is None
+        or cannot be converted with ``float()``.
+    """
+    if base is None:
+        return None
+    try:
+        # float() is applied here so a conversion failure yields None (missing)
+        # rather than the 0.0 that _clamp01 would return for the same input.
+        return _clamp01(float(base))
+    except Exception:
+        return None
+
+def _norm_repo_stars(stars):
+    """Normalize repository stars to [0,1] using a log scale that saturates around 10^3.
+
+    Args:
+        stars: Star count, or None when unknown.
+
+    Returns:
+        float or None: ``log10(stars + 1) / 3`` limited to [0.0, 1.0]; None when
+        ``stars`` is None or the conversion/computation raises.
+    """
+    if stars is None:
+        return None
+    try:
+        s = float(stars)
+        # Negative counts are treated as zero stars.
+        if s < 0:
+            s = 0.0
+        # Matches design: min(1.0, log10(stars+1)/3.0) — ~1.0 around 1k stars
+        return min(1.0, max(0.0, math.log10(s + 1.0) / 3.0))
+    except Exception:
+        return None
+
+def _norm_repo_contributors(contrib):
+    """Normalize repository contributors to [0,1], saturating at ~50 contributors.
+
+    Args:
+        contrib: Contributor count, or None when unknown.
+
+    Returns:
+        float or None: ``contrib / 50`` limited to [0.0, 1.0]; None when
+        ``contrib`` is None or cannot be converted with ``float()``.
+    """
+    if contrib is None:
+        return None
+    try:
+        c = float(contrib)
+        # Negative counts are treated as zero contributors.
+        if c < 0:
+            c = 0.0
+        # Linear scale: 50 or more contributors map to 1.0.
+        return min(1.0, max(0.0, c / 50.0))
+    except Exception:
+        return None
+
+def _parse_iso_to_days(iso_ts):
+    """Parse ISO-8601 timestamp and return days since that time (int).
+
+    Args:
+        iso_ts: Timestamp string; a trailing 'Z' is accepted.
+
+    Returns:
+        int or None: Whole days between the timestamp and the current UTC time
+        (negative when the timestamp lies in the future); None when ``iso_ts``
+        is not a string or cannot be parsed.
+    """
+    try:
+        if isinstance(iso_ts, str):
+            # The 'Z' suffix is dropped before parsing — presumably because older
+            # datetime.fromisoformat() versions reject it; verify against the
+            # minimum supported Python version.
+            if iso_ts.endswith('Z'):
+                dt = datetime.fromisoformat(iso_ts[:-1])
+            else:
+                dt = datetime.fromisoformat(iso_ts)
+            # Naive timestamps (including those that had their 'Z' stripped) are
+            # interpreted as UTC so they can be subtracted from an aware "now".
+            if dt.tzinfo is None:
+                dt = dt.replace(tzinfo=timezone.utc)
+            now = datetime.now(timezone.utc)
+            return (now - dt).days
+    except Exception:
+        return None
+    # Reached only when iso_ts is not a string.
+    return None
+
+def _norm_repo_last_activity(iso_ts):
+    """Normalize last activity recency into [0,1] using tiered thresholds.
+
+    Tiers (days since last activity): <= 30 -> 1.0, <= 365 -> 0.6,
+    <= 730 -> 0.3, otherwise 0.0.
+
+    Args:
+        iso_ts: ISO-8601 timestamp string of the last activity, or a falsy
+            value when unknown.
+
+    Returns:
+        float or None: Tier value; None when ``iso_ts`` is falsy or
+        ``_parse_iso_to_days`` cannot parse it.
+    """
+    if not iso_ts:
+        return None
+    days = _parse_iso_to_days(iso_ts)
+    if days is None:
+        return None
+    # NOTE(review): a negative day count (timestamp in the future) also
+    # satisfies this first comparison and scores 1.0.
+    if days <= 30:
+        return 1.0
+    if days <= 365:
+        return 0.6
+    if days <= 730:
+        return 0.3
+    return 0.0
+
+def _norm_bool(flag):
+    """Normalize boolean to [0,1]; None -> None (missing).
+
+    Args:
+        flag: Value interpreted by truthiness, or None when unknown.
+
+    Returns:
+        float or None: 1.0 for truthy values, 0.0 for falsy values other than
+        None, and None when ``flag`` is None.
+    """
+    if flag is None:
+        return None
+    return 1.0 if bool(flag) else 0.0
+
+def _norm_version_match(vm):
+    """Normalize version match dict to [0,1]. True match => 1.0; else 0.0; None => missing.
+
+    Args:
+        vm: Mapping whose 'matched' entry is read via ``.get``, or None.
+
+    Returns:
+        float or None: 1.0 when ``vm['matched']`` is truthy, 0.0 when it is
+        falsy or absent, and None when ``vm`` is None or the lookup raises
+        (for example when ``vm`` has no ``.get`` method).
+    """
+    if vm is None:
+        return None
+    try:
+        return 1.0 if bool(vm.get('matched', False)) else 0.0
+    except Exception:
+        return None
+
+def compute_final_score(mp):
+    """Compute the final normalized score in [0,1] with per-heuristic breakdown and weights.
+
+    Normalized inputs (each in [0,1], None if missing):
+      - base_score (existing pkg.score if provided)
+      - repo_version_match
+      - repo_stars
+      - repo_contributors
+      - repo_last_activity
+      - repo_present_in_registry
+
+    Default weights (sum to 1.0 when all present; re-normalized when some are missing):
+      - base_score: 0.30
+      - repo_version_match: 0.30
+      - repo_stars: 0.15
+      - repo_contributors: 0.10
+      - repo_last_activity: 0.10
+      - repo_present_in_registry: 0.05
+
+    Args:
+      mp: Package object. The attributes score, repo_version_match, repo_stars,
+        repo_contributors, repo_last_activity_at, repo_present_in_registry and
+        repo_url_normalized are read with getattr(..., None), so any may be absent.
+
+    Returns:
+      tuple(final_score: float, breakdown: dict, weights_used: dict)
+
+      breakdown maps every metric name to {'raw': ..., 'normalized': ...}.
+      weights_used maps each available metric to its re-normalized weight; it is
+      {} (and final_score is 0.0) when no metric is available.
+    """
+    # Raw values
+    raw = {
+        'base_score': getattr(mp, 'score', None),
+        'repo_version_match': getattr(mp, 'repo_version_match', None),
+        'repo_stars': getattr(mp, 'repo_stars', None),
+        'repo_contributors': getattr(mp, 'repo_contributors', None),
+        'repo_last_activity': getattr(mp, 'repo_last_activity_at', None),
+        'repo_present_in_registry': getattr(mp, 'repo_present_in_registry', None),
+    }
+
+    # Normalized values
+    norm = {
+        'base_score': _norm_base_score(raw['base_score']),
+        'repo_version_match': _norm_version_match(raw['repo_version_match']),
+        'repo_stars': _norm_repo_stars(raw['repo_stars']),
+        'repo_contributors': _norm_repo_contributors(raw['repo_contributors']),
+        'repo_last_activity': _norm_repo_last_activity(raw['repo_last_activity']),
+        # False is normalized to 0.0 here; the case where it should count as
+        # missing (no normalized repo URL) is reclassified right after this dict.
+        'repo_present_in_registry': _norm_bool(raw['repo_present_in_registry']),
+    }
+    # If present_in_registry is False (normalized 0.0) and no normalized repo URL exists,
+    # consider it missing (None) for scoring/weight renormalization purposes.
+    # This avoids penalizing packages that only have a base score.
+    if norm['repo_present_in_registry'] == 0.0 and getattr(mp, 'repo_url_normalized', None) is None:
+        norm['repo_present_in_registry'] = None
+
+    # Default weights
+    weights = {
+        'base_score': 0.30,
+        'repo_version_match': 0.30,
+        'repo_stars': 0.15,
+        'repo_contributors': 0.10,
+        'repo_last_activity': 0.10,
+        'repo_present_in_registry': 0.05,
+    }
+
+    # Re-normalize weights to only those metrics that are present (norm != None)
+    available = [k for k, v in norm.items() if v is not None]
+    total_w = sum(weights[k] for k in available) if available else 0.0
+    # Nothing to score: return 0.0 with the full breakdown and no weights.
+    if total_w <= 0.0:
+        breakdown = {k: {'raw': raw[k], 'normalized': norm[k]} for k in norm.keys()}
+        return 0.0, breakdown, {}
+
+    weights_used = {k: weights[k] / total_w for k in available}
+
+    # Weighted sum ensures range [0,1] since each component is clamped and weights sum to 1
+    final = 0.0
+    for k in available:
+        val = norm.get(k)
+        # Defensive only: `available` already excludes metrics whose value is None.
+        if val is None:
+            continue
+        final += float(val) * weights_used[k]
+    # Final clamp guards against floating-point drift outside [0,1].
+    final = _clamp01(final)
+
+    breakdown = {k: {'raw': raw[k], 'normalized': norm[k]} for k in norm.keys()}
+    return final, breakdown, weights_used
+
 def combobulate_min(pkgs):
     """Run to check the existence of the packages in the registry.
 
@@ -112,15 +280,47 @@ def combobulate_heur(pkgs):
     Args:
         pkgs (list): List of packages to check.
     """
+    logger = logging.getLogger(__name__)
     for x in pkgs:
         test_exists(x)
         if x.exists is True:
-            # Add repository signals score to existing score
-            repo_score = compute_repo_signals_score(x)
-            if x.score is not None:
-                x.score += repo_score
-            else:
-                x.score = repo_score
+            # Compute final normalized score in [0,1] using available metrics
+            final_score, breakdown, weights_used = compute_final_score(x)
+            x.score = final_score
+            if is_debug_enabled(logger):
+                logger.debug(
+                    "Heuristics score breakdown",
+                    extra=extra_context(
+                        event="analysis",
+                        component="heuristics",
+                        action="score_breakdown",
+                        package_name=str(x),
+                        final_score=final_score,
+                        weights=weights_used,
+                        breakdown=breakdown,
+                    ),
+                )
+            # Emit [ANALYSIS] lines for repository signals
+            try:
+                if getattr(x, "repo_stars", None) is not None:
+                    logging.info("%s.... repository stars: %s.", STG, str(x.repo_stars))
+                if getattr(x, "repo_contributors", None) is not None:
+                    logging.info("%s.... repository contributors: %s.", STG, str(x.repo_contributors))
+                if getattr(x, "repo_last_activity_at", None):
+                    _days = _parse_iso_to_days(x.repo_last_activity_at)
+                    if _days is not None:
+                        logging.info("%s.... repository last activity %d days ago.", STG, int(_days))
+                if getattr(x, "repo_present_in_registry", None) is not None:
+                    logging.info("%s.... repository present in registry: %s.", STG, str(x.repo_present_in_registry))
+                if getattr(x, "repo_version_match", None) is not None:
+                    try:
+                        _matched = bool(x.repo_version_match.get('matched', False))
+                        logging.info("%s.... repository version match: %s.", STG, "yes" if _matched else "no")
+                    except Exception:
+                        logging.info("%s.... repository version match: unavailable.", STG)
+            except Exception:
+                # Do not break analysis on logging issues
+                pass
             test_score(x)
             test_timestamp(x)
             test_version_count(x)

diff --git a/src/depgate.py b/src/depgate.py
@@ -110,6 +110,11 @@ def export_csv(instances, path):
         "Risk: Min Versions",
         "Risk: Too New",
         "Risk: Any Risks",
+        "repo_stars",
+        "repo_contributors",
+        "repo_last_activity",
+        "repo_present_in_registry",
+        "repo_version_match",
     ]
     rows = [headers]
     for x in instances:
@@ -140,6 +145,11 @@ def export_json(instances, path):
             "score": x.score,
             "versionCount": x.version_count,
             "createdTimestamp": x.timestamp,
+            "repo_stars": x.repo_stars,
+            "repo_contributors": x.repo_contributors,
+            "repo_last_activity": x.repo_last_activity_at,
+            "repo_present_in_registry": (None if (getattr(x, "repo_url_normalized", None) is None and x.repo_present_in_registry is False) else x.repo_present_in_registry),
+            "repo_version_match": x.repo_version_match,
             "risk": {
                 "hasRisk": x.has_risk(),
                 "isMissing": x.risk_missing,

diff --git a/src/metapackage.py b/src/metapackage.py
@@ -66,6 +66,9 @@ def listall(self):
         Returns:
             list: List of all the attributes of the class.
         """
+        def nv(v):
+            return "" if v is None else v
+
         lister = []
         lister.append(self._pkg_name)
         lister.append(self._pkg_type)
@@ -79,6 +82,25 @@ def listall(self):
         lister.append(self._risk_min_versions)
         lister.append(self._risk_too_new)
         lister.append(self.has_risk())
+
+        # New repo_* CSV columns (empty string for missing)
+        lister.append(nv(self._repo_stars))
+        lister.append(nv(self._repo_contributors))
+        lister.append(nv(self._repo_last_activity_at))
+        # CSV default handling: empty when not set; if explicitly False but no normalized repo URL,
+        # treat as missing for CSV (empty)
+        if (self._repo_present_in_registry is False) and (self._repo_url_normalized is None):
+            lister.append("")
+        else:
+            lister.append(nv(self._repo_present_in_registry))
+        if self._repo_version_match is None:
+            lister.append("")
+        else:
+            try:
+                lister.append(bool(self._repo_version_match.get('matched')))
+            except Exception:  # defensive: malformed dict
+                lister.append("")
+
         return lister
 
     @staticmethod
@@ -405,7 +427,7 @@ def repo_present_in_registry(self):
         """Property for repository presence in registry.
 
         Returns:
-            bool: True if repository URL is present in package registry
+            bool or None: True if repository URL is present in package registry; None if unknown
         """
         return self._repo_present_in_registry
 
@@ -418,7 +440,7 @@ def repo_resolved(self):
         """Property for repository resolution status.
 
         Returns:
-            bool: True if repository URL has been resolved and validated
+            bool or None: True if repository URL has been resolved and validated; None if unknown
         """
         return self._repo_resolved
 

diff --git a/src/registry/maven/client.py b/src/registry/maven/client.py
@@ -10,7 +10,7 @@
 from typing import List
 
 from constants import ExitCodes, Constants
-from common.http_client import safe_get
+import common.http_client as http_client
 from common.logging_utils import extra_context, is_debug_enabled, Timer, safe_url, redact
 
 from .enrich import _enrich_with_repo  # Not used here but kept for parity if needed later
@@ -49,7 +49,7 @@ def recv_pkg_info(pkgs, url: str = Constants.REGISTRY_URL_MAVEN) -> None:
                 headers = {"Accept": "application/json", "Content-Type": "application/json"}
                 # Sleep to avoid rate limiting
                 time.sleep(0.1)
-                res = safe_get(url, context="maven", params=payload, headers=headers)
+                res = http_client.safe_get(url, context="maven", params=payload, headers=headers)
             except SystemExit:
                 # safe_get calls sys.exit on errors, so we need to catch and re-raise as exception
                 logger.error(

diff --git a/src/repository/provider_adapters.py b/src/repository/provider_adapters.py
@@ -65,9 +65,15 @@ def get_releases(self, owner: str, repo: str) -> List[Dict[str, str]]:
             repo: Repository name
 
         Returns:
-            List of release dictionaries
+            List of release dictionaries. Falls back to tags if releases are empty.
         """
-        return self.client.get_releases(owner, repo)
+        releases = self.client.get_releases(owner, repo)
+        if releases:
+            return releases
+
+        # Fallback: use tags when releases are unavailable to enable version matching
+        tags = self.client.get_tags(owner, repo)
+        return tags or []
 
 
 class GitLabProviderAdapter(ProviderClient):
@@ -123,6 +129,12 @@ def get_releases(self, owner: str, repo: str) -> List[Dict[str, str]]:
             repo: Project name
 
         Returns:
-            List of release dictionaries
+            List of release dictionaries. Falls back to tags if releases are empty.
         """
-        return self.client.get_releases(owner, repo)
+        releases = self.client.get_releases(owner, repo)
+        if releases:
+            return releases
+
+        # Fallback: use tags when releases are unavailable to enable version matching
+        tags = self.client.get_tags(owner, repo)
+        return tags or []