|
| 1 | +# Copyright 2025 Google LLC. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +#!/usr/bin/env python3 |
| 16 | +"""Validation for COMMUNITY_PROVIDERS.md plugin registry table.""" |
| 17 | + |
| 18 | +import os |
| 19 | +from pathlib import Path |
| 20 | +import re |
| 21 | +import re as regex_module |
| 22 | +import sys |
| 23 | +from typing import Dict, List, Tuple |
| 24 | + |
# Substring used to locate the header row of the plugin registry table.
HEADER_ANCHOR = '| Plugin Name | PyPI Package |'
# HTML comment marking the line immediately after the last registry row.
END_MARKER = '<!-- ADD NEW PLUGINS ABOVE THIS LINE -->'

# GitHub username/org and repo patterns
GH_NAME = r'[-a-zA-Z0-9]+'  # usernames/orgs: letters, digits and hyphens
GH_REPO = r'[-a-zA-Z0-9._]+'  # repos additionally allow . and _
# One Markdown link of the form [@handle](https://github.com/handle). The link
# text and the URL handle are matched independently, so this pattern does not
# require them to be the same name.
GH_USER_LINK = rf'\[@{GH_NAME}\]\(https://github\.com/{GH_NAME}\)'
# One or more user links, separated by a comma and optional whitespace.
GH_MULTI_USER = rf'^{GH_USER_LINK}(,\s*{GH_USER_LINK})*$'

# Markdown link (any non-empty link text) to https://github.com/<owner>/<repo>
GH_REPO_LINK = rf'^\[[^\]]+\]\(https://github\.com/{GH_NAME}/{GH_REPO}\)$'

# Issue link must point to LangExtract repository (issues only): the URL has to
# end in /issues/<digits>, so pull-request or discussion URLs do not match.
LANGEXTRACT_ISSUE_LINK = (
    r'^\[[^\]]+\]\(https://github\.com/google/langextract/issues/\d+\)$'
)

# PEP 503-ish normalized name (loose): lowercase letters/digits with - _ . separators
# The package name must be wrapped in backticks, and at most one separator may
# appear between alphanumeric runs.
PYPI_NORMALIZED = r'`[a-z0-9]([\-_.]?[a-z0-9]+)*`'

# Minimum number of characters required in the Description column.
MIN_DESC_LEN = 10
| 46 | + |
| 47 | + |
def normalize_pypi(name: str) -> str:
  """Return the PEP 503 normalized form of a PyPI package name.

  Surrounding whitespace is removed, the name is lowercased, and every run of
  '-', '_' or '.' characters is collapsed into a single '-'.

  Args:
    name: Package name as written by the user.

  Returns:
    The normalized name, suitable for comparing two package names.
  """
  lowered = name.strip().lower()
  separator_runs = regex_module.compile(r'[-_.]+')
  return separator_runs.sub('-', lowered)
| 51 | + |
| 52 | + |
def find_table_bounds(lines: List[str]) -> Tuple[int, int]:
  """Locate the registry table inside the document.

  Args:
    lines: The document split into lines.

  Returns:
    A (start, end) pair of 0-based indices. `start` is the index of the most
    recent line containing HEADER_ANCHOR seen before the end marker, and `end`
    is the index of the first later line containing END_MARKER. Either value
    is -1 when the corresponding line was not found; the end marker is only
    recognised after a header has been seen.
  """
  header_idx = -1
  for idx, text in enumerate(lines):
    if HEADER_ANCHOR in text:
      # A later header line replaces an earlier one.
      header_idx = idx
      continue
    if header_idx != -1 and END_MARKER in text:
      return header_idx, idx
  return header_idx, -1
| 62 | + |
| 63 | + |
def parse_row(line: str) -> List[str]:
  """Split one Markdown table row into its stripped cell texts.

  A pipe preceded by a backslash is treated as cell content rather than as a
  column delimiter, so a cell may contain an escaped pipe without changing the
  column count. The escape sequence itself is kept verbatim in the cell text.

  Args:
    line: A table row that the caller has already trimmed and that starts and
      ends with a pipe character.

  Returns:
    The cell texts between the leading and trailing pipes, each stripped of
    surrounding whitespace.
  """
  cells: List[str] = []
  current: List[str] = []
  chars = iter(line)
  for ch in chars:
    if ch == '\\':
      # Keep the backslash together with the character it escapes so that an
      # escaped pipe stays inside the current cell instead of splitting it.
      current.append(ch + next(chars, ''))
    elif ch == '|':
      cells.append(''.join(current).strip())
      current = []
    else:
      current.append(ch)
  cells.append(''.join(current).strip())
  # Drop the fragments before the leading pipe and after the trailing pipe.
  return cells[1:-1]
| 68 | + |
| 69 | + |
def _check_row(line_no: int, cols: List[str], errors: List[str]) -> None:
  """Check the six cells of one registry row, appending problems to `errors`.

  Args:
    line_no: 1-based line number of the row, used to prefix each message.
    cols: Exactly six stripped cell texts, in table column order.
    errors: List that receives one message per failed check.
  """
  plugin, pypi, maint, repo, desc, issue_link = cols

  # Basic presence checks
  if not plugin:
    errors.append(f'Line {line_no}: Plugin Name is required.')

  if not re.fullmatch(PYPI_NORMALIZED, pypi):
    errors.append(
        f'Line {line_no}: PyPI package must be backticked and normalized (e.g.,'
        ' `langextract-provider-foo`).'
    )

  if not re.fullmatch(GH_MULTI_USER, maint):
    errors.append(
        f'Line {line_no}: Maintainer must be one or more GitHub handles as links '
        '(e.g., [@alice](https://github.com/alice) or comma-separated).'
    )

  if not re.fullmatch(GH_REPO_LINK, repo):
    errors.append(
        f'Line {line_no}: GitHub Repo must be a Markdown link to a GitHub'
        ' repository.'
    )

  if not desc or len(desc) < MIN_DESC_LEN:
    errors.append(
        f'Line {line_no}: Description must be at least {MIN_DESC_LEN} characters.'
    )

  # Issue link is required and must point to LangExtract repo
  if not issue_link:
    errors.append(f'Line {line_no}: Issue Link is required.')
  elif not re.fullmatch(LANGEXTRACT_ISSUE_LINK, issue_link):
    errors.append(
        f'Line {line_no}: Issue Link must point to a LangExtract issue (e.g.,'
        ' [#123](https://github.com/google/langextract/issues/123)).'
    )


def _check_duplicates(rows: List[Dict], errors: List[str]) -> None:
  """Report plugin names and PyPI packages that occur more than once.

  Plugin names are compared case-insensitively and package names after PEP 503
  normalization. The error is attached to the second and later occurrences.
  """
  seen_names = set()
  seen_pkgs = set()
  for r in rows:
    pn_key = r['plugin'].strip().casefold()
    pk_key = normalize_pypi(r['pypi']) if r['pypi'] else None

    if pn_key in seen_names:
      errors.append(f"Line {r['line']}: Duplicate Plugin Name '{r['plugin']}'.")
    seen_names.add(pn_key)

    if pk_key and pk_key in seen_pkgs:
      errors.append(f"Line {r['line']}: Duplicate PyPI Package '{r['pypi']}'.")
    if pk_key:
      seen_pkgs.add(pk_key)


def _check_sorted(rows: List[Dict], errors: List[str]) -> None:
  """Report an error unless rows are ordered case-insensitively by plugin name."""
  names = [r['plugin'] for r in rows]
  # sorted() is stable, so names that are equal ignoring case keep their order
  # and do not trigger a spurious ordering error.
  if names != sorted(names, key=str.casefold):
    errors.append('Registry rows must be alphabetically sorted by Plugin Name.')


def validate(filepath: Path) -> bool:
  """Validate the plugin registry table in a Markdown file and print a report.

  The table is located with find_table_bounds(). Rows are read starting two
  lines below the header line (the line directly below the header is skipped
  unchecked) up to, but excluding, the end-marker line. Blank lines are
  ignored. Rows that are not pipe-delimited or do not have six columns are
  reported and excluded from the duplicate and ordering checks.

  Args:
    filepath: Path of the Markdown file; it is read as UTF-8.

  Returns:
    True when no errors were found (warnings do not fail validation), False
    otherwise. The report is always printed via print_report().
  """
  errors: List[str] = []
  warnings: List[str] = []

  lines = filepath.read_text(encoding='utf-8').splitlines()

  start, end = find_table_bounds(lines)
  if start < 0:
    errors.append('Could not find plugin registry table header.')
  elif end < 0:
    errors.append(f'Could not find end marker: {END_MARKER}.')
  if errors:
    # Without both bounds there is no table to inspect.
    print_report(errors, warnings)
    return False

  rows: List[Dict] = []
  # lines[start + 2] is file line start + 3 in 1-based numbering.
  for line_no, text in enumerate(lines[start + 2:end], start=start + 3):
    raw = text.strip()
    if not raw:
      continue

    if not raw.startswith('|') or not raw.endswith('|'):
      errors.append(
          f"Line {line_no}: Not a valid table row (must start and end with '|')."
      )
      continue

    cols = parse_row(raw)
    if len(cols) != 6:
      errors.append(f'Line {line_no}: Expected 6 columns, found {len(cols)}.')
      continue

    _check_row(line_no, cols, errors)

    plugin, pypi = cols[0], cols[1]
    rows.append({
        'line': line_no,
        'plugin': plugin,
        'pypi': pypi.strip('`').lower() if pypi else '',
    })

  # Duplicate checks (case-insensitive and PEP 503 normalized)
  _check_duplicates(rows, errors)

  # Required alphabetical sorting check
  _check_sorted(rows, errors)

  # Guardrail: discourage leaving only the example entry
  if len(rows) == 1 and rows[0]['plugin'].lower().startswith('example'):
    warnings.append(
        'The registry currently contains only the example row. Add real'
        ' providers above the marker.'
    )

  print_report(errors, warnings)
  return not errors
| 181 | + |
| 182 | + |
def print_report(errors: List[str], warnings: List[str]) -> None:
  """Print the validation outcome to standard output.

  Errors are listed first under a failure heading, followed by warnings under
  their own heading. The success line is printed only when both lists are
  empty.

  Args:
    errors: Messages describing validation failures.
    warnings: Messages describing non-fatal findings.
  """
  if not (errors or warnings):
    print('✅ Table format validation passed!')
    return

  sections = (
      ('❌ Validation failed:', errors),
      ('⚠️ Warnings:', warnings),
  )
  for heading, messages in sections:
    if not messages:
      continue
    print(heading)
    for message in messages:
      print(f' • {message}')
| 194 | + |
| 195 | + |
def main(argv: List[str]) -> int:
  """Run the validator as a command-line program.

  Args:
    argv: Full argument vector including the program name. argv[1], when
      present, is the file to validate; otherwise COMMUNITY_PROVIDERS.md in
      the current directory is used.

  Returns:
    The process exit status: 0 when validation passes, 1 when it fails or when
    the target is not an existing regular file.
  """
  path = Path(argv[1]) if len(argv) > 1 else Path('COMMUNITY_PROVIDERS.md')
  # is_file() rather than exists(): a directory exists but cannot be read as
  # text, which would otherwise surface as an unhandled traceback.
  if not path.is_file():
    print(f'❌ Error: File not found: {path}')
    return 1
  return 0 if validate(path) else 1


if __name__ == '__main__':
  sys.exit(main(sys.argv))
0 commit comments