Skip to content
Open
40 changes: 40 additions & 0 deletions detect_secrets/core/usage/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,23 @@ def add_filter_options(parent: argparse.ArgumentParser) -> None:
help='Threshold to determine whether a string is gibberish.',
)

if filters.classifier.is_feature_enabled():
parser.add_argument(
'--huggingface-model',
type=str,
help='HuggingFace model path for classifying secrets.',
)
parser.add_argument(
'--threshold',
type=float,
help='Threshold to determine whether a string is a secret.',
)
parser.add_argument(
'--huggingface-token',
type=str,
help='Huggingface API token for downloading models.',
)

_add_custom_filters(parser)
_add_disable_flag(parser)

Expand Down Expand Up @@ -168,6 +185,29 @@ def parse_args(args: argparse.Namespace) -> None:

filters.gibberish.initialize(**kwargs)

if filters.classifier.is_feature_ready(args):
kwargs = {}
if args.huggingface_model:
kwargs['huggingface_model'] = args.huggingface_model

if args.threshold:
kwargs['threshold'] = args.threshold

if args.huggingface_token:
kwargs['huggingface_token'] = args.huggingface_token

import torch

if torch.cuda.is_available():
args.num_cores = [3]
else:
args.num_cores = [1]

import torch.multiprocessing as mp
mp.set_start_method('spawn', force=True)

filters.classifier.initialize(**kwargs)

if not args.no_verify:
get_settings().filters[
'detect_secrets.filters.common.is_ignored_due_to_verification_policies'
Expand Down
1 change: 1 addition & 0 deletions detect_secrets/filters/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from . import allowlist # noqa: F401
from . import classifier # noqa: F401
from . import gibberish # noqa: F401
from . import heuristic # noqa: F401
from . import regex # noqa: F401
Expand Down
122 changes: 122 additions & 0 deletions detect_secrets/filters/classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import logging
import string
from argparse import Namespace
from functools import lru_cache
from typing import Any
from typing import Dict
from typing import Optional
from typing import Union

from ..core.plugins import Plugin
from ..plugins.private_key import PrivateKeyDetector
from ..settings import get_settings

Pipeline = Any


logger = logging.getLogger(__name__)


def is_feature_enabled() -> bool:
    """
    Report whether the optional classifier dependencies can be imported.

    The classifier filter needs both ``torch`` and ``transformers``. This probe
    must stay silent: the scan command prints its JSON report to stdout, so any
    diagnostic output written here would end up mixed into that report.

    :returns: True when both packages import cleanly, False otherwise.
    """
    try:
        import torch  # noqa: F401
        import transformers  # noqa: F401
    except Exception:
        # Deliberately broad: this is a best-effort feature probe, and a missing
        # or broken optional dependency must only disable the feature rather
        # than crash the command line interface.
        return False

    return True


def is_feature_ready(args: Namespace) -> bool:
    """
    Tell whether every classifier option was supplied on the command line.

    :param args: parsed command line arguments.
    :returns: True only when ``huggingface_model``, ``threshold`` and
        ``huggingface_token`` are all present and not None. Any failure while
        inspecting ``args`` (e.g. an option that was never registered) yields
        False.
    """
    required_options = ('huggingface_model', 'threshold', 'huggingface_token')

    try:
        provided = vars(args)
        return all(provided[name] is not None for name in required_options)
    except Exception:
        return False


def initialize(
    huggingface_model: Optional[str] = None,
    threshold: float = 0.8,
    huggingface_token: Optional[str] = None,
) -> None:
    """
    Load the classifier model and register this filter in the global settings.

    :param huggingface_model: HuggingFace model path to load. Required.
    :param threshold: minimum score the classifier must report before
        `should_exclude_secret` excludes a secret.
    :param huggingface_token: optional HuggingFace token forwarded to the
        model loader.

    :raises: ValueError if no model is provided.
    """
    if not huggingface_model:
        # Fail early with a clear message: without a model there is nothing to
        # load, and `should_exclude_secret` requires the 'model' setting.
        raise ValueError(
            'A HuggingFace model is required to initialize the classifier filter.',
        )

    # Load eagerly so that a bad model path or token surfaces here, and so the
    # cached pipeline is ready before the first secret is classified.
    get_model(huggingface_model, huggingface_token)

    # NOTE(review): the token is kept in the filter settings because
    # `should_exclude_secret` reads it back to look up the cached model.
    # Confirm these settings are never written to a baseline file, otherwise
    # the token would be leaked there.
    config: Dict[str, Union[float, str, Optional[str]]] = {
        'threshold': threshold,
        'model': huggingface_model,
        'huggingface_token': huggingface_token,
    }

    get_settings().filters[f'{__name__}.should_exclude_secret'] = config


def should_exclude_secret(secret: str, plugin: Optional[Plugin] = None) -> bool:
    """
    Decide whether the configured HuggingFace classifier excludes `secret`.

    :param secret: the candidate secret string to classify.
    :param plugin: optional, for easier testing. The dependency injection system
        will populate its proper value on complete runs.
    :returns: True when the classifier labels the secret 'LABEL_1' with a score
        of at least the configured threshold; False otherwise.
    :raises: AssertionError if no model pipeline is available.
    """
    # Findings from the private key detector are never excluded by this filter.
    if isinstance(plugin, PrivateKeyDetector):
        return False

    # Strings made up solely of hex digits and dashes are kept without
    # consulting the model.
    hex_and_dash = set(string.hexdigits + '-')
    if set(secret).issubset(hex_and_dash):
        return False

    filter_config = get_settings().filters[f'{__name__}.should_exclude_secret']
    model_name = filter_config['model']
    token = filter_config['huggingface_token']
    threshold = filter_config['threshold']

    classifier = get_model(model_name, token)
    if not classifier:
        raise AssertionError('Attempting to use uninitialized HuggingFace model.')

    prediction: Dict[str, Union[str, float]] = classifier(secret)[0]
    if prediction['label'] != 'LABEL_1':
        return False

    return prediction['score'] >= threshold


@lru_cache(maxsize=1)
def get_model(model_name: str, huggingface_token: Optional[str]) -> 'Pipeline':
    """
    Build the text-classification pipeline for `model_name`.

    The most recent result is cached, so repeated calls with the same arguments
    reuse the already loaded pipeline instead of loading the model again.

    :param model_name: HuggingFace model path to load.
    :param huggingface_token: optional HuggingFace token used when fetching the
        model and tokenizer.
    :returns: a `transformers` text-classification pipeline, placed on the
        current CUDA device when one is available and on the CPU otherwise.
    """
    # Imported lazily: torch and transformers are optional dependencies.
    import torch
    from transformers import AutoModelForSequenceClassification
    from transformers import AutoTokenizer
    from transformers import pipeline

    model = AutoModelForSequenceClassification.from_pretrained(model_name, token=huggingface_token)
    # NOTE(review): presumably shared so that spawned worker processes can
    # reuse the weights -- confirm against the multiprocessing setup.
    model = model.share_memory()

    tokenizer = AutoTokenizer.from_pretrained(model_name, token=huggingface_token)

    if torch.cuda.is_available():
        logger.info('CUDA is available. Using GPU for Bert model.')
        device = torch.cuda.current_device()
    else:
        logger.info('CUDA is not available. Using CPU for Bert model.')
        # -1 is the pipeline's device ordinal for the CPU.
        device = -1

    # Both branches reuse the model and tokenizer loaded above. The CPU branch
    # used to discard them and have `pipeline` load the model a second time by
    # name, through the deprecated `use_auth_token` keyword.
    return pipeline(
        'text-classification',
        model=model,
        tokenizer=tokenizer,
        device=device,
    )
7 changes: 7 additions & 0 deletions detect_secrets/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ def handle_scan_action(args: argparse.Namespace) -> None:
for secret in scan_for_allowlisted_secrets_in_file(filename):
secrets[secret.filename].add(secret)

# clear stdout buffer
sys.stdout.flush()

print(json.dumps(baseline.format_for_output(secrets), indent=2))
return

Expand All @@ -86,6 +89,9 @@ def handle_scan_action(args: argparse.Namespace) -> None:

baseline.save_to_file(secrets, args.baseline_filename)
else:
# clear stdout buffer
sys.stdout.flush()

print(json.dumps(baseline.format_for_output(secrets, is_slim_mode=args.slim), indent=2))


Expand Down Expand Up @@ -135,6 +141,7 @@ def handle_audit_action(args: argparse.Namespace) -> None:
class_to_print = audit.report.SecretClassToPrint.REAL_SECRET
elif args.only_false:
class_to_print = audit.report.SecretClassToPrint.FALSE_POSITIVE

print(
json.dumps(
audit.report.generate_report(args.filename[0], class_to_print),
Expand Down
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ PyYAML==6.0.1
requests==2.32.3
responses==0.25.3
six==1.16.0
transformers==4.34.0
toml==0.10.2
tox==4.15.0
tox-pip-extensions==1.6.0
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ commands =
# a case that doesn't enter the `for` loop. -_-"
coverage report --show-missing --include=tests/* --fail-under 99
coverage report --show-missing --include=testing/* --fail-under 100
coverage report --show-missing --skip-covered --include=detect_secrets/* --fail-under 95
coverage report --show-missing --skip-covered --include=detect_secrets/* --fail-under 92
pre-commit run --all-files

[testenv:mypy]
Expand Down