Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
fc058ed
Added Code Understanding sub agent
tmihalac May 6, 2026
6ea3704
Removed java tests added by mistake
tmihalac May 6, 2026
cc04b4a
Fix to prevent tools being called multiple times with the same input
tmihalac May 6, 2026
8603f64
Lowered the cpu requests for the confusion matrix
tmihalac May 7, 2026
d34f61a
Changed the cpu requests for the confusion matrix
tmihalac May 7, 2026
b18ff7b
Fixed GraphRecursionError that kept recurring
tmihalac May 10, 2026
a777ce8
Fixed Package filter prompt overflow
tmihalac May 10, 2026
cbbbe6b
Fixed forced_finish_node token over limit size
tmihalac May 11, 2026
e166133
Added more tests following review
tmihalac May 11, 2026
51e6f86
Added a note to _LoggingEmbeddingProxy following review
tmihalac May 11, 2026
21d54c4
Added more examples to dispatcher following review
tmihalac May 11, 2026
f6f305d
Fixed IMPORT_USAGE_ANALYZER tool availability following review
tmihalac May 11, 2026
bef2863
Renamed the reachability prompts following review
tmihalac May 11, 2026
52d35bf
Renamed build_system_prompt func following review
tmihalac May 11, 2026
d1e8a36
Add warning that config files bigger than 500k are skipped, following…
tmihalac May 11, 2026
12f77d0
Add concurrency handling to configuration_scanner following review
tmihalac May 11, 2026
1d13cb1
Add concurrency handling to configuration_scanner following review
tmihalac May 11, 2026
9ee97d2
Add concurrency handling to full_text_search following review
tmihalac May 11, 2026
d1c9af5
Added license info following review
tmihalac May 11, 2026
ad68555
Added license info and removed unused imports following review
tmihalac May 11, 2026
4d8cdcf
Added lock to move_to_end call, following review
tmihalac May 12, 2026
929487b
Balanced dispatcher examples, following review
tmihalac May 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .tekton/on-cm-runner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ spec:
resources:
requests:
memory: "1Gi"
cpu: "500m"
cpu: "300m"
readinessProbe:
tcpSocket:
port: 9092
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ test: test-unit test-llm-metrics ## Run all tests.
@echo "All tests have been run."

test-unit: ## Run unit tests.
@echo "Running unit tests in $(SRC_DIR)..."
@python -m pytest $(SRC_DIR) $(PYTEST_OPTS)
@echo "Running unit tests in $(SRC_DIR) and tests/..."
@python -m pytest $(SRC_DIR) tests/ $(PYTEST_OPTS)

test-llm-metrics: ## Run LLM metrics tests.
@echo "Running LLM metrics tests..."
Expand Down
33 changes: 31 additions & 2 deletions src/exploit_iq_commons/utils/dep_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,10 @@ class DependencyTreeBuilder(ABC):
supported ecosystem.
"""

# Directory name where dependency source files are stored (e.g. "vendor", "node_modules").
# Each subclass sets this to its ecosystem's convention.
DEP_SOURCE_DIR: str = ""

@abstractmethod
# Build a sort of "upside down" tree - a dict containing mapping of each
# package to a list of all consuming packages
Expand All @@ -157,6 +161,8 @@ def install_dependencies(self, manifest_path: Path):


class CCppDependencyTreeBuilder(DependencyTreeBuilder):
DEP_SOURCE_DIR = C_DEP_LIBS_NAME

# Pre-compiled regex patterns (optimization: compile once, use many times)
INCLUDE_COMBINED_RE = re.compile(
r'#include\s*([<"])([^>"]+)[>"]'
Expand All @@ -181,7 +187,7 @@ def __init__(self):
"bench", "benchmark", "demo", "sample"
]
self.C_STANDARD_LIB = "glibc"
self.RPM_LIBS_DIR = C_DEP_LIBS_NAME
self.RPM_LIBS_DIR = self.DEP_SOURCE_DIR
self.output_json_path = None
self.ccp_dep_tree = None

Expand Down Expand Up @@ -788,6 +794,7 @@ def find_project_name(self, root_dir="."):


class GoDependencyTreeBuilder(DependencyTreeBuilder):
DEP_SOURCE_DIR = "vendor"

def install_dependencies(self, manifest_path: Path):
self.download_go_mod_vendor(manifest_path)
Expand Down Expand Up @@ -897,6 +904,7 @@ def extract_package_name(self, package_name: str) -> str:
return package_name

class JavaDependencyTreeBuilder(DependencyTreeBuilder):
DEP_SOURCE_DIR = "dependencies-sources"

def __init__(self, query: str):
self._query = query
Expand All @@ -911,7 +919,7 @@ def __check_file_exists(self, dir_path: str | Path, filename: str) -> bool:
def install_dependencies(self, manifest_path: Path):
mvn_command = "mvn"
settings_path = os.getenv('JAVA_MAVEN_DEFAULT_SETTINGS_FILE_PATH','../../../../kustomize/base/settings.xml')
source_path = "dependencies-sources"
source_path = self.DEP_SOURCE_DIR

if self.__check_file_exists(manifest_path, "mvnw"):
mvn_command = "./mvnw"
Expand Down Expand Up @@ -1168,6 +1176,7 @@ def looks_like_version(v: str) -> bool:
return depth, coord

class PythonDependencyTreeBuilder(DependencyTreeBuilder):
DEP_SOURCE_DIR = TRANSITIVE_ENV_NAME

def build_tree(self, manifest_path: Path) -> defaultdict[Any, list]:
venv_python = f'{manifest_path}/{TRANSITIVE_ENV_NAME}/bin/python'
Expand Down Expand Up @@ -1565,6 +1574,7 @@ def install_dependency(self, dependency, repo_path):
logger.warning('Failed to install dependency %s', dependency)

class JavaScriptDependencyTreeBuilder(DependencyTreeBuilder):
DEP_SOURCE_DIR = "node_modules"

def build_tree(self, manifest_path: Path) -> dict[str, list[str]]:

Expand Down Expand Up @@ -1639,6 +1649,25 @@ def get_dependency_tree_builder(programming_language: Ecosystem, query: str = ""
)


# Ecosystem -> builder class lookup. Lets ECOSYSTEM_DEP_DIRS be derived from
# each class-level DEP_SOURCE_DIR attribute without instantiating any builder.
_ECOSYSTEM_BUILDER_MAP: dict[Ecosystem, type[DependencyTreeBuilder]] = {
    Ecosystem.C_CPP: CCppDependencyTreeBuilder,
    Ecosystem.GO: GoDependencyTreeBuilder,
    Ecosystem.JAVA: JavaDependencyTreeBuilder,
    Ecosystem.PYTHON: PythonDependencyTreeBuilder,
    Ecosystem.JAVASCRIPT: JavaScriptDependencyTreeBuilder,
}

# Ecosystem value -> dependency source directory prefix (e.g. "vendor/").
# Built from each builder's DEP_SOURCE_DIR; builders that declare an empty
# DEP_SOURCE_DIR are intentionally left out of the mapping.
ECOSYSTEM_DEP_DIRS: dict[str, str] = {}
for _eco, _builder_cls in _ECOSYSTEM_BUILDER_MAP.items():
    if _builder_cls.DEP_SOURCE_DIR:
        ECOSYSTEM_DEP_DIRS[_eco.value] = _builder_cls.DEP_SOURCE_DIR + "/"
del _eco, _builder_cls


class DependencyTree:
"""
A class that represents a dependency tree to access an appropriate
Expand Down
64 changes: 59 additions & 5 deletions src/exploit_iq_commons/utils/document_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,52 @@ def lazy_parse(self, blob: Blob) -> typing.Iterator[Document]:
)


class _LoggingEmbeddingProxy:
    """Wraps an Embeddings instance to log per-batch progress during VDB creation.

    FAISS calls embed_documents once with all texts; the NIM SDK loops
    internally in batches of max_batch_size calling _embed per batch.
    This proxy intercepts embed_documents and does the batching itself
    so it can log progress between batches.

    NOTE: Tightly coupled with langchain_nvidia_ai_endpoints.NVIDIAEmbeddings.
    Calls the private _embed(texts, model_type="passage") method directly.
    Other Embeddings implementations (langchain ABC) don't expose _embed,
    so this proxy will break if the embedding type changes or if NVIDIAEmbeddings
    renames/removes _embed in a future version.
    """

    def __init__(self, embedding, total_chunks: int, start_time: float):
        # embedding: the wrapped NVIDIAEmbeddings-like instance.
        # total_chunks: expected total number of chunks, used for %/ETA math.
        # start_time: epoch seconds when embedding began (basis for the rate).
        self._embedding = embedding
        self._total_chunks = total_chunks
        self._start_time = start_time
        self._embedded = 0  # running count of chunks embedded so far

    def embed_documents(self, texts):
        """Embed *texts* in max_batch_size slices, logging progress after each batch."""
        # Guard against max_batch_size being present but falsy (None or 0),
        # which would otherwise crash range() or produce empty slicing steps.
        batch_size = getattr(self._embedding, "max_batch_size", None) or 128
        all_embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            batch_start = time.time()
            all_embeddings.extend(self._embedding._embed(batch, model_type="passage"))
            self._embedded += len(batch)
            elapsed = time.time() - self._start_time
            rate = self._embedded / elapsed if elapsed > 0 else 0
            remaining_min = ((self._total_chunks - self._embedded) / rate / 60) if rate > 0 else 0
            logger.info("Embedding progress: %d / %d chunks (%.1f%%) - batch took %.2fs - ETA %.1f min",
                        self._embedded, self._total_chunks,
                        self._embedded / self._total_chunks * 100,
                        time.time() - batch_start,
                        remaining_min)
        return all_embeddings

    def embed_query(self, text):
        """Delegate single-query embedding straight to the wrapped instance."""
        return self._embedding.embed_query(text)

    def __getattr__(self, name):
        # Transparently forward every other attribute (model name, config, ...)
        # so the proxy is a drop-in replacement for the wrapped embedding.
        return getattr(self._embedding, name)


class DocumentEmbedding:
"""
A class to create a FAISS database from a list of source documents. The source documents are collected from git
Expand Down Expand Up @@ -374,8 +420,10 @@ def collect_documents(self, source_info: SourceDocumentsInfo) -> list[Document]:
"""

repo_path = self.get_repo_path(source_info)
cache_name = source_info.type if source_info.type != "code" else ""
documents, documents_were_in_cache = retrieve_from_cache(self._pickle_cache_directory,
source_info.git_repo, source_info.ref)
source_info.git_repo, source_info.ref,
documents_name=cache_name)
if documents_were_in_cache or len(documents) > 0:
return documents

Expand All @@ -387,7 +435,8 @@ def collect_documents(self, source_info: SourceDocumentsInfo) -> list[Document]:
with repo_lock:
# Re-check cache — another thread may have populated it while we waited.
documents, documents_were_in_cache = retrieve_from_cache(self._pickle_cache_directory,
source_info.git_repo, source_info.ref)
source_info.git_repo, source_info.ref,
documents_name=cache_name)
if documents_were_in_cache or len(documents) > 0:
return documents

Expand All @@ -403,7 +452,8 @@ def collect_documents(self, source_info: SourceDocumentsInfo) -> list[Document]:
documents = loader.load()

logger.info("Collected documents for '%s', Document count: %d", repo_path, len(documents))
save_to_cache(self._pickle_cache_directory, source_info.git_repo, source_info.ref, documents)
save_to_cache(self._pickle_cache_directory, source_info.git_repo, source_info.ref, documents,
documents_name=cache_name)
return documents

def create_vdb(self, source_infos: list[SourceDocumentsInfo], output_path: PathLike):
Expand Down Expand Up @@ -465,8 +515,12 @@ def create_vdb(self, source_infos: list[SourceDocumentsInfo], output_path: PathL

embedding_start_time = time.time()

# Wrap embedding in a proxy that logs batch progress
total_chunks = len(chunked_documents)
logging_embedding = _LoggingEmbeddingProxy(self._embedding, total_chunks, embedding_start_time)

# Create the FAISS database
db = FAISS.from_documents(chunked_documents, self._embedding)
db = FAISS.from_documents(chunked_documents, logging_embedding)

logger.info("Completed embedding in %.2f seconds for '%s'", time.time() - embedding_start_time, output_path)

Expand Down Expand Up @@ -513,7 +567,7 @@ def build_vdbs(self,
# Create embeddings for each source type
for source_type in ["code", "doc"]:

if ignore_code_embedding:
if ignore_code_embedding and source_type == "code":
continue

# Filter the source documents
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -763,3 +763,9 @@ def is_call_allowed(self, pkg_docs: list[Document], caller_function: Document, c
return False

return True

def get_import_search_patterns(self, package_name: str) -> list[re.Pattern]:
    """Return compiled regexes that match C/C++ #include directives pulling in
    headers under *package_name* (both the <...> and "..." forms)."""
    pkg = re.escape(package_name)
    include_pattern = re.compile(
        rf'#include\s*[<"]({pkg}[^>"]*)[>"]',
        re.IGNORECASE | re.MULTILINE,
    )
    return [include_pattern]
Original file line number Diff line number Diff line change
Expand Up @@ -598,4 +598,11 @@ def is_package_imported(self, code_content: str, identifier: str, callee_package
package_name = import_package_line.split(r"\s")[1]
if package_name.strip().lower() == callee_package.strip().lower():
return True
return False
return False

def get_import_search_patterns(self, package_name: str) -> list[re.Pattern]:
    """Return compiled regexes matching Go import statements for *package_name*,
    covering both the single-line form and the parenthesized import block."""
    pkg = re.escape(package_name)
    flags = re.IGNORECASE | re.MULTILINE
    single_import = rf'import\s+"({pkg}[^"]*)"'
    grouped_import = rf'import\s+\(\s*[^)]*"({pkg}[^"]*)"'
    return [re.compile(single_import, flags), re.compile(grouped_import, flags)]
Loading
Loading