Comcast · ana-ai-sde · Aug 20, 2024
diff --git a/xgitguard/github-enterprise/enterprise_cred_detections.py b/xgitguard/github-enterprise/enterprise_cred_detections.py
@@ -614,21 +614,11 @@ def run_detection(
     search_query_list = format_search_query_list(configs.secondary_keywords)
     if search_query_list:
         if ml_prediction:
-            # Train Model if not present Already
-            model_file = os.path.join(
-                configs.output_dir, "xgg_cred_rf_model_object.pickle"
-            )
-            if os.path.exists(model_file):
-                logger.info(
-                    f"Detection process will use Already persisted Trained Model present in: {model_file}"
-                )
-            else:
-                logger.info(
-                    f"No persisted Trained Model present. So training and persisting a model now"
-                )
-                xgg_train_model(
-                    training_data_file="cred_train.csv", model_name="xgg_cred_rf_"
-                )
+            # Load BERT model and tokenizer
+            bert_model_path = "path_to_bert_model/bert_secret_detection_model"
+            tokenizer = BertTokenizer.from_pretrained(bert_model_path)
+            model = BertForSequenceClassification.from_pretrained(bert_model_path)
+            model.eval()  # Set model to evaluation mode
     else:
         logger.info(f"No Search query to process. Ending.")
         sys.exit(1)

diff --git a/xgitguard/github-enterprise/enterprise_key_detections.py b/xgitguard/github-enterprise/enterprise_key_detections.py
@@ -593,21 +593,11 @@ def run_detection(
     search_query_list = format_search_query_list(configs.secondary_keywords)
     if search_query_list:
         if ml_prediction:
-            # Train Model if not present Already
-            model_file = os.path.join(
-                configs.output_dir, "xgg_key_rf_model_object.pickle"
-            )
-            if os.path.exists(model_file):
-                logger.info(
-                    f"Detection process will use Already persisted Trained Model present in: {model_file}"
-                )
-            else:
-                logger.info(
-                    f"No persisted Trained Model present. So training and persisting a model now"
-                )
-                xgg_train_model(
-                    training_data_file="key_train.csv", model_name="xgg_key_rf_"
-                )
+            # Load BERT model and tokenizer
+            bert_model_path = "path_to_bert_model/bert_secret_detection_model"
+            tokenizer = BertTokenizer.from_pretrained(bert_model_path)
+            model = BertForSequenceClassification.from_pretrained(bert_model_path)
+            model.eval()  # Set model to evaluation mode
     else:
         logger.info(f"No Search query to process. Ending.")
         sys.exit(1)

diff --git a/xgitguard/github-public/public_cred_detections.py b/xgitguard/github-public/public_cred_detections.py
@@ -628,22 +628,11 @@ def run_detection(
     )
     if search_query_list:
         if ml_prediction:
-            # Train Model if not present Already
-            model_file = os.path.join(
-                configs.output_dir, "public_xgg_cred_rf_model_object.pickle"
-            )
-            if os.path.exists(model_file):
-                logger.info(
-                    f"Detection process will use Already persisted Trained Model present in: {model_file}"
-                )
-            else:
-                logger.info(
-                    f"No persisted Trained Model present. So training and persisting a model now"
-                )
-                xgg_train_model(
-                    training_data_file="public_cred_train.csv",
-                    model_name="public_xgg_cred_rf_",
-                )
+            # Load BERT model and tokenizer
+            bert_model_path = "path_to_bert_model/bert_secret_detection_model"
+            tokenizer = BertTokenizer.from_pretrained(bert_model_path)
+            model = BertForSequenceClassification.from_pretrained(bert_model_path)
+            model.eval()  # Set model to evaluation mode
     else:
         logger.info(f"No Search query to process. Ending.")
         sys.exit(1)

diff --git a/xgitguard/github-public/public_key_detections.py b/xgitguard/github-public/public_key_detections.py
@@ -603,22 +603,11 @@ def run_detection(
     )
     if search_query_list:
         if ml_prediction:
-            # Train Model if not present Already
-            model_file = os.path.join(
-                configs.output_dir, "public_xgg_key_rf_model_object.pickle"
-            )
-            if os.path.exists(model_file):
-                logger.info(
-                    f"Detection process will use Already persisted Trained Model present in: {model_file}"
-                )
-            else:
-                logger.info(
-                    f"No persisted Trained Model present. So training and persisting a model now"
-                )
-                xgg_train_model(
-                    training_data_file="public_key_train.csv",
-                    model_name="public_xgg_key_rf_",
-                )
+            # Load BERT model and tokenizer
+            bert_model_path = "path_to_bert_model/bert_secret_detection_model"
+            tokenizer = BertTokenizer.from_pretrained(bert_model_path)
+            model = BertForSequenceClassification.from_pretrained(bert_model_path)
+            model.eval()  # Set model to evaluation mode
     else:
         logger.info(f"No Search query to process. Ending.")
         sys.exit(1)