From a52fe80b83173bc1d5936b8b8e5bcd28983df234 Mon Sep 17 00:00:00 2001 From: cdc-ap66 Date: Fri, 12 Dec 2025 17:30:36 +0000 Subject: [PATCH 1/4] version bump and release notes --- changelog.md | 14 +++++ docs/release_notes/v2025.12.10.md | 91 +++++++++++++++++++++++++++++++ pyproject.toml | 2 +- 3 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 docs/release_notes/v2025.12.10.md diff --git a/changelog.md b/changelog.md index 2bb0414..1088139 100644 --- a/changelog.md +++ b/changelog.md @@ -10,6 +10,20 @@ The versioning pattern is `YYYY.MM.DD.micro(a/b/{none if release}) ## [2025.12.10.0a] +### Summary + +**First Official Release!** 🎉 + +This release marks a significant milestone for the CFA DataOps project, providing a robust foundation for data cataloging, ETL pipelines, and reporting. It consolidates months of development into a unified, versioned package ready for broader adoption. + +**Key Highlights:** +- **Unified Data Access**: `datacat` interface for seamless dataset access. +- **Automated Reporting**: `reportcat` for generating interactive HTML reports from Jupyter notebooks. +- **Robust CLI Tools**: Manage catalogs and datasets effortlessly. +- **Flexible Data Loading**: Advanced version filtering for Pandas and Polars DataFrames. + +See the [Release Notes](docs/release_notes/v2025.12.10.md) for more details. + ### Updated - Added version matching to `BlobEndpoint.get_file_ext()` method for greater flexibility to changing file types over time. diff --git a/docs/release_notes/v2025.12.10.md b/docs/release_notes/v2025.12.10.md new file mode 100644 index 0000000..56da63f --- /dev/null +++ b/docs/release_notes/v2025.12.10.md @@ -0,0 +1,91 @@ +# Release Notes - v2025.12.10 + +We're thrilled to announce the first official release of the **CFA DataOps** project! 
This milestone version `2025.12.10.0` brings a comprehensive suite of tools for data cataloging, ETL pipelines, and reporting, designed to streamline data operations within the CFA environment. + +## 🚀 Highlights + +* **Unified Data Access**: Seamlessly access datasets across multiple catalogs using the `datacat` interface. +* **Automated Reporting**: Generate interactive, client-side-rendered HTML reports from Jupyter notebooks with `reportcat`. +* **Robust CLI Tools**: Manage catalogs, datasets, stages, and versions directly from the command line. +* **Flexible Data Loading**: Load data into Pandas or Polars DataFrames with advanced version filtering and selection. +* **Azure Blob Storage Integration**: Built-in support for reading/writing raw and transformed data to Azure Blob Storage. +* **Schema Validation**: Ensure data quality with rigorous schema validation for both raw and transformed datasets. + +## 🏗️ Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ CFA Data Science Ecosystem │ +├─────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ Cloud OPS │────>│ Data OPS │────>│ Catalogs │ │ +│ │ │ │ │ │ │ │ +│ │ • Compute │ │ • Datacat │ │ • Public │ │ +│ │ • BLOB │ │ • Reportcat │ │ • Private │ │ +│ │ • Key Vault │ │ • Ledger │ │ • Team-spec. │ │ +│ │ • │ │ • Cat init │ │ • workflows │ │ +│ │ │ │ • │ │ • reports │ │ +│ └──────────────┘ └──────────────┘ └──────────────┘ │ +│ │ │ │ │ +│ └────────────────────┼────────────────────┘ │ +│ V │ +│ ┌──────────────────┐ │ +│ │ Data Scientists │ │ +│ │ & Applications │ │ +│ └──────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## 📋 Key Features + +### Data Management +* **Catalog Creation**: Initialize new dataset catalog libraries with standardized structures using `dataops_catalog_init`.
+* **Multi-Catalog Support**: Install and manage multiple catalog libraries in the same Python environment. +* **Configuration-Driven ETL**: Define ETL pipelines using simple TOML configuration files. + +### Data Access & Versioning +* **Version Control**: Retrieve specific data versions using timestamp-based versioning. +* **Advanced Filtering**: Use conditional logic (e.g., `>2024.12.01,<2025.08`, `latest`, `~=2024/11`) to select data versions. +* **Local Download**: Download dataset versions to your local filesystem for offline analysis. + +### Reporting & Visualization +* **Jupyter Integration**: Author reports as Jupyter notebooks and convert them to interactive HTML. +* **Visualization Utilities**: Includes plotting functions for lines, points, and intervals, plus PDF report generation. + +## 🗃️ Existing and Growing Catalogs + +* **Public**: [https://github.com/CDCgov/cfa-catalog-pub](https://github.com/CDCgov/cfa-catalog-pub) +* **Private**: [https://github.com/cdcent/cfa-catalog-private](https://github.com/cdcent/cfa-catalog-private) + +## 🛠️ Usage Examples + +**List Available Datasets:** +```python +from cfa.dataops import datacat +print(datacat.__namespace_list__) +``` + +**Load a Dataframe with Version Filtering:** +```python +from cfa.dataops import datacat +df = datacat.public.my_dataset.load.get_dataframe(version=">2024.12.01,<2025.08") +``` + +**Generate a Report:** +```python +from cfa.dataops.reporting import reportcat +reportcat.examples.dataset_report_ipynb.nb_to_html_file('report.html') +``` + +## 📚 Documentation + +For more detailed information, please refer to our comprehensive documentation: +* [Data User Guide](https://cdcgov.github.io/cfa-dataops/data_user_guide/) +* [Data Developer Guide](https://cdcgov.github.io/cfa-dataops/data_developer_guide/) +* [Managing Catalogs](https://cdcgov.github.io/cfa-dataops/managing_catalogs/) +* [Report Generation](https://cdcgov.github.io/cfa-dataops/report_generation/) +* [CLI 
Tools](https://cdcgov.github.io/cfa-dataops/cli_tools/) + +--- +*Thank you to all the contributors who made this release possible!* diff --git a/pyproject.toml b/pyproject.toml index 742b660..a317b24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "cfa.dataops" -version = "2025.12.10.0a" +version = "2025.12.10.0" description = "Data cataloging, ETL, modeling, verification, and validation for CFA" authors = [ { name = "Phil Rogers", email = "ap66@cdc.gov" }, From c0a0998cb263915ede1c6dac0fe3475e8b8d2c65 Mon Sep 17 00:00:00 2001 From: cdc-ap66 Date: Fri, 12 Dec 2025 17:36:18 +0000 Subject: [PATCH 2/4] version update --- changelog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changelog.md b/changelog.md index 1088139..9cb4ace 100644 --- a/changelog.md +++ b/changelog.md @@ -8,7 +8,7 @@ The versioning pattern is `YYYY.MM.DD.micro(a/b/{none if release}) --- -## [2025.12.10.0a] +## [2025.12.10.0] ### Summary From 5ecfc5ba689a894e1ed623ebabc3c65ea557ea6e Mon Sep 17 00:00:00 2001 From: cdc-ap66 Date: Fri, 12 Dec 2025 17:50:43 +0000 Subject: [PATCH 3/4] simplifying --- .github/workflows/release.yaml | 77 ---------------------------------- 1 file changed, 77 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index cfaf5ce..23dcd20 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -17,103 +17,26 @@ jobs: with: fetch-depth: 0 - - name: Checkout release - run: | - git fetch origin release - git checkout release - - - name: Determine Version - id: get-version - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - VERSION="${GITHUB_REF#refs/tags/}" - echo "version=$VERSION" >> "$GITHUB_OUTPUT" - - - name: Determine If Tag Is From Release - id: is-release - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - TAG_COMMIT=$(git rev-parse ${{ github.ref }}) - BRANCHES=$(git branch -r --contains "$TAG_COMMIT") - - if echo "$BRANCHES" | grep -q 
'origin/release'; then - echo "should_continue=true" >> "$GITHUB_OUTPUT" - else - echo "Exiting Workflow: Tag is not from release branch" - echo "IF intending to create a release from tag follow steps under 'Creating A Release' in the README file" - echo "should_continue=false" >> "$GITHUB_OUTPUT" - fi - - name: Setup Python - if: steps.is-release.outputs.should_continue == 'true' uses: actions/setup-python@v5 with: python-version: '3.10' - name: Install Poetry - if: steps.is-release.outputs.should_continue == 'true' run: | curl -sSL https://install.python-poetry.org | python3 - poetry --version - name: Install Dependencies - if: steps.is-release.outputs.should_continue == 'true' run: | poetry install --no-root --with dev - - name: Update pyproject.toml Version - if: steps.is-release.outputs.should_continue == 'true' - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - VERSION=${{ steps.get-version.outputs.version }} - - sed -i.bak -E "s/^\s*version\s*=\s*\"[^\"]+\"/version = \"${VERSION}\"/" pyproject.toml - - - name: Update Change Log - if: steps.is-release.outputs.should_continue == 'true' - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - VERSION=${{ steps.get-version.outputs.version }} - REPO_URL="https://github.com/${{ github.repository }}" - RELEASE_URL="$REPO_URL/releases/tag/$VERSION" - - awk -v tag="## [$VERSION]" -v url="$RELEASE_URL" ' - !done && /---/ { - print $0 - print "" - print tag - print url - done = 1 - next - } - { print $0 } - ' changelog.md > temp.md && mv temp.md changelog.md - - name: Build Wheel - if: steps.is-release.outputs.should_continue == 'true' run: | poetry build - name: Create Release - if: steps.is-release.outputs.should_continue == 'true' env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | gh release create ${{ github.ref }} dist/* --generate-notes --latest - - - name: Commit Change Log and pyproject.toml Updates - if: steps.is-release.outputs.should_continue == 'true' - env: - GITHUB_TOKEN: ${{ 
secrets.GITHUB_TOKEN }} - run: | - VERSION=${{ steps.get-version.outputs.version }} - - git config user.name "github-actions" - git config user.email "github-actions@users.noreply.github.com" - git add pyproject.toml changelog.md - git commit -m "Update Version to ${{ steps.get-version.outputs.version }}" - git push origin release From 67a1dc9cbf6b5c84d919e5c7547581a7e1dddc0d Mon Sep 17 00:00:00 2001 From: cdc-ap66 Date: Fri, 12 Dec 2025 17:53:52 +0000 Subject: [PATCH 4/4] one last simplification --- .github/workflows/release.yaml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 23dcd20..7a08032 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -22,14 +22,13 @@ jobs: with: python-version: '3.10' - - name: Install Poetry + - name: Install poetry run: | - curl -sSL https://install.python-poetry.org | python3 - - poetry --version + pip install "poetry>=2.0" - name: Install Dependencies run: | - poetry install --no-root --with dev + poetry install --with dev - name: Build Wheel run: |