.coveragerc: 8 changes (7 additions, 1 deletion)
@@ -1,12 +1,18 @@
 [run]
 source = datafog
-omit =
+omit =
     */tests/*
    */test_*
    */__pycache__/*
    */venv/*
    */env/*
    setup.py
+    datafog/__init___lean.py
+    datafog/__init___original.py
+    datafog/main_lean.py
+    datafog/main_original.py
+    datafog/services/text_service_lean.py
+    datafog/services/text_service_original.py
 
 [report]
 exclude_lines =
.github/workflows/ci.yml: 12 changes (6 additions, 6 deletions)
@@ -38,13 +38,13 @@ jobs:
           sudo apt-get update
           sudo apt-get install -y tesseract-ocr libtesseract-dev
 
-      - name: Install minimal dependencies to prevent segfault
+      - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -e ".[dev]"
+          pip install -e ".[dev]"
           pip install -r requirements-dev.txt
-          # Add only safe extras that don't include heavy ML dependencies
-          pip install -e ".[cli]"
+          pip install -e ".[nlp,cli]"
+          pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
 
       - name: Run test suite (ignore segfault during cleanup)
         run: |
@@ -86,9 +86,9 @@ jobs:
             exit(1)
             "
 
-      - name: Run coverage on core modules only
+      - name: Run coverage
         run: |
-          python -m pytest tests/test_text_service.py tests/test_regex_annotator.py tests/test_anonymizer.py --cov=datafog --cov-report=xml --cov-config=.coveragerc
+          python -m pytest tests/ -v --ignore=tests/test_gliner_annotator.py --cov=datafog --cov-report=xml --cov-config=.coveragerc
 
       - name: Upload coverage
         uses: codecov/codecov-action@v4
README.md: 23 changes (23 additions, 0 deletions)
@@ -294,6 +294,29 @@ async def redact_pii_middleware(request, call_next):

---

## Privacy & Telemetry

DataFog collects **anonymous** usage telemetry to help us understand which features are used and prioritize development. This data contains:

- Function and engine usage (e.g., "regex" vs "gliner")
- Coarse performance buckets (e.g., "10-100ms"), never exact timings
- Error class names only (e.g., "ImportError"), never error messages or stack traces
- A one-way hashed machine identifier — no IP addresses, usernames, or file paths

**No text content, PII, or personally identifiable information is ever collected.**
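As a concrete illustration, an event for a `detect()` call might look like the sketch below. The field names match the `track_function_call(...)` keyword arguments used in this PR; the `machine_id` field name and hash format are assumptions.

```python
# Illustrative telemetry event for a detect() call (not the exact wire format).
event = {
    "function_name": "detect",
    "module": "datafog",
    "engine": "regex",
    "text_length_bucket": "100-1000",   # coarse bucket, not an exact length
    "entity_count": 2,
    "entity_types_found": ["EMAIL", "PHONE"],
    "duration_ms_bucket": "10-100ms",   # coarse bucket, not an exact timing
    "machine_id": "a3f9c2...",          # one-way hash; assumed field name
}
```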

To opt out, set either environment variable before running DataFog:

```bash
export DATAFOG_NO_TELEMETRY=1
# or
export DO_NOT_TRACK=1
```

Telemetry uses only Python's standard library (`urllib.request`), so no additional dependencies are installed. All sends are fire-and-forget in background threads and will never block your code or raise exceptions.
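
For reference, here is a minimal sketch of that opt-out check and fire-and-forget pattern. The endpoint URL and function names are illustrative assumptions; the actual `datafog/telemetry.py` is not shown in this diff:

```python
import json
import os
import threading
import urllib.request


def _telemetry_disabled() -> bool:
    # Either environment variable disables telemetry entirely.
    return bool(os.environ.get("DATAFOG_NO_TELEMETRY") or os.environ.get("DO_NOT_TRACK"))


def _send(payload: dict) -> None:
    # Best-effort HTTP POST; every failure is swallowed.
    try:
        req = urllib.request.Request(
            "https://telemetry.example.com/v1/events",  # illustrative endpoint
            data=json.dumps(payload).encode("utf-8"),
            headers={"Content-Type": "application/json"},
        )
        urllib.request.urlopen(req, timeout=2)
    except Exception:
        pass  # telemetry must never raise


def track(payload: dict) -> None:
    if _telemetry_disabled():
        return
    # Daemon thread: never blocks the caller and never delays interpreter exit.
    threading.Thread(target=_send, args=(payload,), daemon=True).start()
```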

---

## Common Use Cases

### Enterprise
datafog/__init__.py: 45 changes (45 additions, 0 deletions)
@@ -149,6 +149,11 @@ def detect(text: str) -> list:
>>> detect("Contact john@example.com")
[{'type': 'EMAIL', 'value': 'john@example.com', 'start': 8, 'end': 24}]
"""
import time as _time

_start = _time.monotonic()

_lazy_import_regex_annotator()
annotator = RegexAnnotator()
# Use the structured output to get proper positions
_, result = annotator.annotate_with_spans(text)
@@ -166,6 +171,27 @@ def detect(text: str) -> list:
            }
        )

    try:
        from .telemetry import (
            _get_duration_bucket,
            _get_text_length_bucket,
            track_function_call,
        )

        _duration = (_time.monotonic() - _start) * 1000
        entity_types = list({e["type"] for e in entities})
        track_function_call(
            function_name="detect",
            module="datafog",
            engine="regex",
            text_length_bucket=_get_text_length_bucket(len(text)),
            entity_count=len(entities),
            entity_types_found=entity_types,
            duration_ms_bucket=_get_duration_bucket(_duration),
        )
    except Exception:
        pass

    return entities


@@ -190,6 +216,10 @@ def process(text: str, anonymize: bool = False, method: str = "redact") -> dict:
        'findings': [{'type': 'EMAIL', 'value': 'john@example.com', ...}]
    }
    """
    import time as _time

    _start = _time.monotonic()

    findings = detect(text)

    result = {"original": text, "findings": findings}
@@ -216,6 +246,21 @@ def process(text: str, anonymize: bool = False, method: str = "redact") -> dict:

result["anonymized"] = anonymized

try:
from .telemetry import _get_duration_bucket, track_function_call

_duration = (_time.monotonic() - _start) * 1000
track_function_call(
function_name="process",
module="datafog",
anonymize=anonymize,
method=method,
entity_count=len(findings),
duration_ms_bucket=_get_duration_bucket(_duration),
)
except Exception:
pass

return result


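The `_get_duration_bucket` and `_get_text_length_bucket` helpers imported above come from `datafog/telemetry.py`, which this diff does not include. A minimal sketch of the bucketing they imply, with thresholds assumed from the README's "10-100ms" example:

```python
def _get_duration_bucket(duration_ms: float) -> str:
    # Coarse buckets so exact timings are never reported (thresholds assumed).
    if duration_ms < 10:
        return "<10ms"
    if duration_ms < 100:
        return "10-100ms"
    if duration_ms < 1000:
        return "100-1000ms"
    return ">1000ms"


def _get_text_length_bucket(length: int) -> str:
    # Same idea for input size (thresholds assumed).
    if length < 100:
        return "<100"
    if length < 1000:
        return "100-1000"
    return ">1000"
```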
datafog/client.py: 74 changes (74 additions, 0 deletions)
@@ -48,8 +48,26 @@ def scan_image(
    try:
        results = asyncio.run(ocr_client.run_ocr_pipeline(image_urls=image_urls))
        typer.echo(f"OCR Pipeline Results: {results}")

        try:
            from .telemetry import track_function_call

            track_function_call(
                function_name="scan_image",
                module="datafog.client",
                source="cli",
                batch_size=len(image_urls),
            )
        except Exception:
            pass
    except Exception as e:
        logging.exception("Error in run_ocr_pipeline")
        try:
            from .telemetry import track_error

            track_error("scan_image", type(e).__name__, source="cli")
        except Exception:
            pass
        typer.echo(f"Error: {str(e)}", err=True)
        raise typer.Exit(code=1)

@@ -83,8 +101,27 @@ def scan_text(
    try:
        results = text_client.run_text_pipeline_sync(str_list=str_list)
        typer.echo(f"Text Pipeline Results: {results}")

        try:
            from .telemetry import track_function_call

            track_function_call(
                function_name="scan_text",
                module="datafog.client",
                source="cli",
                batch_size=len(str_list),
                operations=[op.value for op in operation_list],
            )
        except Exception:
            pass
    except Exception as e:
        logging.exception("Text pipeline error")
        try:
            from .telemetry import track_error

            track_error("scan_text", type(e).__name__, source="cli")
        except Exception:
            pass
        typer.echo(f"Error: {str(e)}", err=True)
        raise typer.Exit(code=1)

@@ -245,6 +282,18 @@ def redact_text(text: str = typer.Argument(None, help="Text to redact")):
    result = anonymizer.anonymize(text, annotations)
    typer.echo(result.anonymized_text)

    try:
        from .telemetry import track_function_call

        track_function_call(
            function_name="redact_text",
            module="datafog.client",
            source="cli",
            method="redact",
        )
    except Exception:
        pass


@app.command()
def replace_text(text: str = typer.Argument(None, help="Text to replace PII")):
@@ -266,6 +315,18 @@ def replace_text(text: str = typer.Argument(None, help="Text to replace PII")):
    result = anonymizer.anonymize(text, annotations)
    typer.echo(result.anonymized_text)

    try:
        from .telemetry import track_function_call

        track_function_call(
            function_name="replace_text",
            module="datafog.client",
            source="cli",
            method="replace",
        )
    except Exception:
        pass


@app.command()
def hash_text(
@@ -291,6 +352,19 @@ def hash_text(
    result = anonymizer.anonymize(text, annotations)
    typer.echo(result.anonymized_text)

    try:
        from .telemetry import track_function_call

        track_function_call(
            function_name="hash_text",
            module="datafog.client",
            source="cli",
            method="hash",
            hash_type=hash_type.value,
        )
    except Exception:
        pass


if __name__ == "__main__":
    app()
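
Every command above repeats the same guarded import/track/except-pass block. A decorator is one way that pattern could be factored out; this is a sketch for illustration, not part of the PR:

```python
import functools


def tracked(function_name: str, **static_fields):
    """Wrap a CLI command so telemetry stays best-effort and can never break it."""

    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            result = fn(*args, **kwargs)
            try:
                from datafog.telemetry import track_function_call

                track_function_call(
                    function_name=function_name,
                    module="datafog.client",
                    source="cli",
                    **static_fields,
                )
            except Exception:
                pass
            return result

        return wrapper

    return decorator
```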