.coveragerc: 8 changes (7 additions, 1 deletion)
@@ -1,12 +1,18 @@
 [run]
 source = datafog
-omit =
+omit =
     */tests/*
    */test_*
    */__pycache__/*
    */venv/*
    */env/*
    setup.py
+    datafog/__init___lean.py
+    datafog/__init___original.py
+    datafog/main_lean.py
+    datafog/main_original.py
+    datafog/services/text_service_lean.py
+    datafog/services/text_service_original.py
 
 [report]
 exclude_lines =
.github/workflows/ci.yml: 12 changes (6 additions, 6 deletions)
@@ -38,13 +38,13 @@ jobs:
           sudo apt-get update
           sudo apt-get install -y tesseract-ocr libtesseract-dev
 
-      - name: Install minimal dependencies to prevent segfault
+      - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -e ".[dev]"
+          pip install -e ".[dev]"
           pip install -r requirements-dev.txt
-          # Add only safe extras that don't include heavy ML dependencies
-          pip install -e ".[cli]"
+          pip install -e ".[nlp,cli]"
+          pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
 
       - name: Run test suite (ignore segfault during cleanup)
         run: |
@@ -86,9 +86,9 @@ jobs:
             exit(1)
             "
 
-      - name: Run coverage on core modules only
+      - name: Run coverage
         run: |
-          python -m pytest tests/test_text_service.py tests/test_regex_annotator.py tests/test_anonymizer.py --cov=datafog --cov-report=xml --cov-config=.coveragerc
+          python -m pytest tests/ -v --ignore=tests/test_gliner_annotator.py --cov=datafog --cov-report=xml --cov-config=.coveragerc
 
       - name: Upload coverage
         uses: codecov/codecov-action@v4
README.md: 23 changes (23 additions, 0 deletions)
@@ -294,6 +294,29 @@ async def redact_pii_middleware(request, call_next):

---

## Privacy & Telemetry

DataFog collects **anonymous** usage telemetry to help us understand which features are used and prioritize development. This data contains:

- Function and engine usage (e.g., "regex" vs "gliner")
- Coarse performance buckets (e.g., "10-100ms"), never exact timings
- Error class names only (e.g., "ImportError"), never error messages or stack traces
- A one-way hashed machine identifier — no IP addresses, usernames, or file paths

**No text content, PII, or personally identifiable information is ever collected.**
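As a concrete illustration, an event for a `detect()` call might look like the sketch below. The field names match the `track_function_call(...)` keyword arguments used in this PR; the `machine_id` field name and hash format are assumptions.

```python
# Illustrative telemetry event for a detect() call (not the exact wire format).
event = {
    "function_name": "detect",
    "module": "datafog",
    "engine": "regex",
    "text_length_bucket": "100-1000",   # coarse bucket, not an exact length
    "entity_count": 2,
    "entity_types_found": ["EMAIL", "PHONE"],
    "duration_ms_bucket": "10-100ms",   # coarse bucket, not an exact timing
    "machine_id": "a3f9c2...",          # one-way hash; assumed field name
}
```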

To opt out, set either environment variable before running DataFog:

```bash
export DATAFOG_NO_TELEMETRY=1
# or
export DO_NOT_TRACK=1
```

Telemetry uses only Python's standard library (`urllib.request`), so no additional dependencies are installed. All sends are fire-and-forget in background threads and will never block your code or raise exceptions.
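
For reference, here is a minimal sketch of that opt-out check and fire-and-forget pattern. The endpoint URL and function names are illustrative assumptions; the actual `datafog/telemetry.py` is not shown in this diff:

```python
import json
import os
import threading
import urllib.request


def _telemetry_disabled() -> bool:
    # Either environment variable disables telemetry entirely.
    return bool(os.environ.get("DATAFOG_NO_TELEMETRY") or os.environ.get("DO_NOT_TRACK"))


def _send(payload: dict) -> None:
    # Best-effort HTTP POST; every failure is swallowed.
    try:
        req = urllib.request.Request(
            "https://telemetry.example.com/v1/events",  # illustrative endpoint
            data=json.dumps(payload).encode("utf-8"),
            headers={"Content-Type": "application/json"},
        )
        urllib.request.urlopen(req, timeout=2)
    except Exception:
        pass  # telemetry must never raise


def track(payload: dict) -> None:
    if _telemetry_disabled():
        return
    # Daemon thread: never blocks the caller and never delays interpreter exit.
    threading.Thread(target=_send, args=(payload,), daemon=True).start()
```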

---

## Common Use Cases

### Enterprise
datafog/__init__.py: 45 changes (45 additions, 0 deletions)
@@ -149,6 +149,11 @@ def detect(text: str) -> list:
>>> detect("Contact john@example.com")
[{'type': 'EMAIL', 'value': 'john@example.com', 'start': 8, 'end': 24}]
"""
import time as _time

_start = _time.monotonic()

_lazy_import_regex_annotator()
annotator = RegexAnnotator()
# Use the structured output to get proper positions
_, result = annotator.annotate_with_spans(text)
@@ -166,6 +171,27 @@ def detect(text: str) -> list:
            }
        )

    try:
        from .telemetry import (
            _get_duration_bucket,
            _get_text_length_bucket,
            track_function_call,
        )

        _duration = (_time.monotonic() - _start) * 1000
        entity_types = list({e["type"] for e in entities})
        track_function_call(
            function_name="detect",
            module="datafog",
            engine="regex",
            text_length_bucket=_get_text_length_bucket(len(text)),
            entity_count=len(entities),
            entity_types_found=entity_types,
            duration_ms_bucket=_get_duration_bucket(_duration),
        )
    except Exception:
        pass

    return entities


@@ -190,6 +216,10 @@ def process(text: str, anonymize: bool = False, method: str = "redact") -> dict:
        'findings': [{'type': 'EMAIL', 'value': 'john@example.com', ...}]
    }
    """
    import time as _time

    _start = _time.monotonic()

    findings = detect(text)

    result = {"original": text, "findings": findings}
@@ -216,6 +246,21 @@ def process(text: str, anonymize: bool = False, method: str = "redact") -> dict:

result["anonymized"] = anonymized

try:
from .telemetry import _get_duration_bucket, track_function_call

_duration = (_time.monotonic() - _start) * 1000
track_function_call(
function_name="process",
module="datafog",
anonymize=anonymize,
method=method,
entity_count=len(findings),
duration_ms_bucket=_get_duration_bucket(_duration),
)
except Exception:
pass

return result


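The `_get_duration_bucket` and `_get_text_length_bucket` helpers imported above come from `datafog/telemetry.py`, which this diff does not include. A minimal sketch of the bucketing they imply, with thresholds assumed from the README's "10-100ms" example:

```python
def _get_duration_bucket(duration_ms: float) -> str:
    # Coarse buckets so exact timings are never reported (thresholds assumed).
    if duration_ms < 10:
        return "<10ms"
    if duration_ms < 100:
        return "10-100ms"
    if duration_ms < 1000:
        return "100-1000ms"
    return ">1000ms"


def _get_text_length_bucket(length: int) -> str:
    # Same idea for input size (thresholds assumed).
    if length < 100:
        return "<100"
    if length < 1000:
        return "100-1000"
    return ">1000"
```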
datafog/client.py: 74 changes (74 additions, 0 deletions)
@@ -48,8 +48,26 @@ def scan_image(
    try:
        results = asyncio.run(ocr_client.run_ocr_pipeline(image_urls=image_urls))
        typer.echo(f"OCR Pipeline Results: {results}")

        try:
            from .telemetry import track_function_call

            track_function_call(
                function_name="scan_image",
                module="datafog.client",
                source="cli",
                batch_size=len(image_urls),
            )
        except Exception:
            pass
    except Exception as e:
        logging.exception("Error in run_ocr_pipeline")
        try:
            from .telemetry import track_error

            track_error("scan_image", type(e).__name__, source="cli")
        except Exception:
            pass
        typer.echo(f"Error: {str(e)}", err=True)
        raise typer.Exit(code=1)

@@ -83,8 +101,27 @@ def scan_text(
    try:
        results = text_client.run_text_pipeline_sync(str_list=str_list)
        typer.echo(f"Text Pipeline Results: {results}")

        try:
            from .telemetry import track_function_call

            track_function_call(
                function_name="scan_text",
                module="datafog.client",
                source="cli",
                batch_size=len(str_list),
                operations=[op.value for op in operation_list],
            )
        except Exception:
            pass
    except Exception as e:
        logging.exception("Text pipeline error")
        try:
            from .telemetry import track_error

            track_error("scan_text", type(e).__name__, source="cli")
        except Exception:
            pass
        typer.echo(f"Error: {str(e)}", err=True)
        raise typer.Exit(code=1)

@@ -245,6 +282,18 @@ def redact_text(text: str = typer.Argument(None, help="Text to redact")):
    result = anonymizer.anonymize(text, annotations)
    typer.echo(result.anonymized_text)

    try:
        from .telemetry import track_function_call

        track_function_call(
            function_name="redact_text",
            module="datafog.client",
            source="cli",
            method="redact",
        )
    except Exception:
        pass


@app.command()
def replace_text(text: str = typer.Argument(None, help="Text to replace PII")):
@@ -266,6 +315,18 @@ def replace_text(text: str = typer.Argument(None, help="Text to replace PII")):
    result = anonymizer.anonymize(text, annotations)
    typer.echo(result.anonymized_text)

    try:
        from .telemetry import track_function_call

        track_function_call(
            function_name="replace_text",
            module="datafog.client",
            source="cli",
            method="replace",
        )
    except Exception:
        pass


@app.command()
def hash_text(
@@ -291,6 +352,19 @@ def hash_text(
    result = anonymizer.anonymize(text, annotations)
    typer.echo(result.anonymized_text)

    try:
        from .telemetry import track_function_call

        track_function_call(
            function_name="hash_text",
            module="datafog.client",
            source="cli",
            method="hash",
            hash_type=hash_type.value,
        )
    except Exception:
        pass


if __name__ == "__main__":
    app()
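
Every command above repeats the same guarded import/track/except-pass block. A decorator is one way that pattern could be factored out; this is a sketch for illustration, not part of the PR:

```python
import functools


def tracked(function_name: str, **static_fields):
    """Wrap a CLI command so telemetry stays best-effort and can never break it."""

    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            result = fn(*args, **kwargs)
            try:
                from datafog.telemetry import track_function_call

                track_function_call(
                    function_name=function_name,
                    module="datafog.client",
                    source="cli",
                    **static_fields,
                )
            except Exception:
                pass
            return result

        return wrapper

    return decorator
```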