release: v1.0.0 — word cloud, revisions, statistics, table of contents

bysiber · bysiber · commit f080ca4db5e0 · 2026-02-25T09:43:15.000+03:00
v1.0.0 Highlights:
- Word Cloud & Frequency Analysis: TF-IDF, stop words, HTML/CSV/markdown output
- Document Revision Tracking: LCS diff, rollback, changelog, merge histories
- Comprehensive Statistics: 25+ metrics, Flesch-Kincaid/ARI reading level
- Table of Contents: auto-generate, inject, numbered, HTML output
- 136 public API exports (+27 new)
- 1126 tests passing
- ~19,000 lines of code
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [1.0.0] - 2025-02-25
+
+### Added
+
+- **Word Cloud & Frequency Analysis** (`wordcloud.py`): Generate word frequency data and cloud visualizations. `WordFrequency` dataclass with count, frequency, rank, TF-IDF, weight. `WordCloudData` with multiple output formats (markdown table, inline HTML cloud, CSV, size map). `generate_word_cloud()` with 130+ built-in stop words, configurable max_words, min_length, min_count, custom stop words. `compare_word_clouds()` for frequency distribution comparison. `tfidf_cloud()` for multi-document TF-IDF analysis. Markdown stripping and code block/URL removal in tokenizer.
+- **Document Revision Tracking** (`revisions.py`): Track changes between document versions with full history management. `Revision` with SHA-256 content hashing, word/line counts. `RevisionDiff` with LCS-based diff algorithm, unified diff output, markdown format. `RevisionHistory` with add/get/rollback/changelog/statistics. `compute_diff()` with modification detection (adjacent delete+add merging). `track_changes()` for quick two-version comparison. `merge_revisions()` with chronological ordering and deduplication.
+- **Comprehensive Statistics** (`statistics.py`): 25+ document metrics with markdown awareness. `TextStatistics` covering characters, words, sentences, paragraphs, vocabulary richness, hapax legomena, reading/speaking time (238/150 WPM). `compare_statistics()` for side-by-side document comparison with diff. `vocabulary_analysis()` with frequency distribution, rare words, type-token ratio. `section_statistics()` for per-heading breakdown. `reading_level()` with Flesch-Kincaid Grade Level and Automated Readability Index.
+- **Table of Contents** (`toc.py`): Generate, customize, and inject table of contents from markdown headings. `TocEntry` with auto-anchor slugification, depth tracking. `TableOfContents` with flat view, level filtering, max_depth. Multiple output formats: markdown, numbered markdown (hierarchical 1, 1.1, 1.2), HTML. `extract_toc()` with duplicate anchor handling. `inject_toc()` with marker-based or auto-placement insertion. `merge_tocs()` for combining multiple ToCs.
+- **New Exports**: 27 new public API exports. Total public API: 136 exports.
+
 ## [0.9.0] - 2025-02-25
 
 ### Added
diff --git a/README.md b/README.md
@@ -515,7 +515,11 @@ tracker.complete()
 | Bibliography mgmt | APA/MLA/BibTeX + auto-extraction | No |
 | Sentiment analysis | Lexicon-based + bias detection | No |
 | Cross-referencing | Sections/figures/tables + validation | No |
-| Lines of code | ~16,000 | ~10,000+ |
+| Word cloud | TF-IDF, frequency analysis, HTML output | No |
+| Revision tracking | LCS diff, rollback, changelog | No |
+| Document statistics | 25+ metrics, reading level, vocabulary | No |
+| Table of contents | Auto-generate, inject, numbered, HTML | No |
+| Lines of code | ~19,000 | ~10,000+ |
 
 deepworm is intentionally simple. If you need a web UI, multi-agent orchestration, or enterprise features, use gpt-researcher. If you want a research tool that just works, use deepworm.
 
@@ -586,6 +590,10 @@ deepworm is intentionally simple. If you need a web UI, multi-agent orchestratio
 - **Bibliography management** — APA, MLA, BibTeX formatting with auto-extraction
 - **Sentiment analysis** — lexicon-based sentiment, tone, and bias detection
 - **Cross-referencing** — internal section/figure/table references with validation
+- **Word cloud** — word frequency analysis, TF-IDF, HTML cloud, CSV export
+- **Revision tracking** — LCS-based diff, rollback, changelog, merge histories
+- **Document statistics** — 25+ metrics, Flesch-Kincaid/ARI reading level, vocabulary analysis
+- **Table of contents** — auto-generate from headings, inject with markers, numbered, HTML
 
 ## License
 
diff --git a/deepworm/__init__.py b/deepworm/__init__.py
@@ -2,7 +2,7 @@
 
 import logging
 
-__version__ = "0.9.0"
+__version__ = "1.0.0"
 
 from .annotations import AnnotationSet, AnnotationType, annotate_report, auto_annotate, extract_annotations
 from .async_api import AsyncResearcher, async_research
@@ -36,12 +36,16 @@
 from .readability import ReadabilityResult, analyze_readability
 from .references import Reference, Bibliography, extract_references, create_reference, inject_bibliography, merge_bibliographies
 from .researcher import DeepResearcher
+from .revisions import Revision, Change, RevisionDiff, RevisionHistory, compute_diff, create_revision, create_history, track_changes, merge_revisions
 from .scoring import QualityScore, score_report
 from .sentiment import SentimentScore, SentimentReport, ToneAnalysis, analyze_sentiment, analyze_tone, analyze_report_sentiment, sentiment_diff
 from .similarity import SimilarityResult, compare_texts, cosine_similarity, detect_plagiarism, find_similar
+from .statistics import TextStatistics, ComparisonResult, compute_statistics, compare_statistics, vocabulary_analysis, section_statistics, reading_level
 from .summary import Summary, extract_key_findings, extract_topics, summarize
 from .timeline import Timeline, TimelineEvent, extract_timeline, create_timeline, compare_timelines
+from .toc import TocEntry, TableOfContents, extract_toc, generate_toc, inject_toc, merge_tocs
 from .validator import ValidationResult, validate_topic
+from .wordcloud import WordFrequency, WordCloudData, generate_word_cloud, compare_word_clouds, tfidf_cloud
 
 __all__ = [
     "APIKeyError",
@@ -53,6 +57,8 @@
     "BatchStatus",
     "BatchTask",
     "Bibliography",
+    "Change",
+    "ComparisonResult",
     "ConfigError",
     "ContentExtractionError",
     "CredibilityReport",
@@ -89,16 +95,24 @@
     "ReportOutline",
     "ResearchPlan",
     "ResearchStage",
+    "Revision",
+    "RevisionDiff",
+    "RevisionHistory",
     "SearchError",
     "SentimentReport",
     "SentimentScore",
     "SessionError",
     "SimilarityResult",
     "Summary",
+    "TableOfContents",
+    "TextStatistics",
     "Timeline",
     "TimelineEvent",
+    "TocEntry",
     "ToneAnalysis",
     "ValidationResult",
+    "WordCloudData",
+    "WordFrequency",
     "__version__",
     "add_footnotes",
     "analyze_readability",
@@ -110,11 +124,17 @@
     "auto_annotate",
     "batch_export",
     "build_crossref_index",
+    "compare_statistics",
     "compare_texts",
     "compare_timelines",
+    "compare_word_clouds",
+    "compute_diff",
+    "compute_statistics",
     "cosine_similarity",
     "create_batch",
+    "create_history",
     "create_reference",
+    "create_revision",
     "create_timeline",
     "detect_plagiarism",
     "estimate_complexity",
@@ -127,32 +147,43 @@
     "extract_references",
     "extract_tags",
     "extract_timeline",
+    "extract_toc",
     "extract_topics",
     "find_similar",
     "generate_list_of_figures",
     "generate_list_of_tables",
     "generate_outline",
     "generate_plan",
+    "generate_toc",
+    "generate_word_cloud",
     "get_language",
     "inject_bibliography",
     "inject_crossrefs",
     "inject_glossary",
+    "inject_toc",
     "list_languages",
     "markdown_to_notion",
     "merge_bibliographies",
     "merge_footnotes",
+    "merge_revisions",
+    "merge_tocs",
     "outline_from_report",
+    "reading_level",
     "renumber_footnotes",
     "research",
     "research_chain",
     "run_batch",
     "score_report",
     "score_source",
     "score_sources",
+    "section_statistics",
     "sentiment_diff",
     "strip_footnotes",
     "summarize",
+    "tfidf_cloud",
+    "track_changes",
     "validate_topic",
+    "vocabulary_analysis",
 ]
 
 # Set up default logging (NullHandler to avoid "No handlers" warnings)
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "deepworm"
-version = "0.9.0"
+version = "1.0.0"
 description = "AI-powered deep research agent. Open-source alternative to OpenAI Deep Research."
 readme = "README.md"
 license = {text = "MIT"}