Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
fc058ed
Added Code Understanding sub agent
tmihalac May 6, 2026
6ea3704
Removed java tests added by mistake
tmihalac May 6, 2026
cc04b4a
Fix to prevent tools being called multiple times with the same input
tmihalac May 6, 2026
8603f64
Lowered the cpu requests for the confusion matrix
tmihalac May 7, 2026
d34f61a
Changed the cpu requests for the confusion matrix
tmihalac May 7, 2026
b18ff7b
Fixed GraphRecursionError that kept recurring
tmihalac May 10, 2026
a777ce8
Fixed Package filter prompt overflow
tmihalac May 10, 2026
cbbbe6b
Fixed forced_finish_node token over limit size
tmihalac May 11, 2026
e166133
Added more tests following review
tmihalac May 11, 2026
51e6f86
Added a note to _LoggingEmbeddingProxy following review
tmihalac May 11, 2026
21d54c4
Added more examples to dispatcher following review
tmihalac May 11, 2026
f6f305d
Fixed IMPORT_USAGE_ANALYZER tool availability following review
tmihalac May 11, 2026
bef2863
Renamed the reachability prompts following review
tmihalac May 11, 2026
52d35bf
Renamed build_system_prompt func following review
tmihalac May 11, 2026
d1e8a36
Add warning that config files bigger than 500k are skipped, following…
tmihalac May 11, 2026
12f77d0
Add concurrency handling to configuration_scanner following review
tmihalac May 11, 2026
1d13cb1
Add concurrency handling to configuration_scanner following review
tmihalac May 11, 2026
9ee97d2
Add concurrency handling to full_text_search following review
tmihalac May 11, 2026
d1c9af5
Added license info following review
tmihalac May 11, 2026
ad68555
Added license info and removed unused imports following review
tmihalac May 11, 2026
4d8cdcf
Added lock to move_to_end call, following review
tmihalac May 12, 2026
929487b
Balanced dispatcher examples, following review
tmihalac May 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .tekton/on-cm-runner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ spec:
resources:
requests:
memory: "1Gi"
cpu: "500m"
cpu: "300m"
readinessProbe:
tcpSocket:
port: 9092
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ test: test-unit test-llm-metrics ## Run all tests.
@echo "All tests have been run."

test-unit: ## Run unit tests.
@echo "Running unit tests in $(SRC_DIR)..."
@python -m pytest $(SRC_DIR) $(PYTEST_OPTS)
@echo "Running unit tests in $(SRC_DIR) and tests/..."
@python -m pytest $(SRC_DIR) tests/ $(PYTEST_OPTS)

test-llm-metrics: ## Run LLM metrics tests.
@echo "Running LLM metrics tests..."
Expand Down
33 changes: 31 additions & 2 deletions src/exploit_iq_commons/utils/dep_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,10 @@ class DependencyTreeBuilder(ABC):
supported ecosystem.
"""

# Directory name where dependency source files are stored (e.g. "vendor", "node_modules").
# Each subclass sets this to its ecosystem's convention.
DEP_SOURCE_DIR: str = ""

@abstractmethod
# Build a sort of "upside down" tree - a dict containing mapping of each
# package to a list of all consuming packages
Expand All @@ -157,6 +161,8 @@ def install_dependencies(self, manifest_path: Path):


class CCppDependencyTreeBuilder(DependencyTreeBuilder):
DEP_SOURCE_DIR = C_DEP_LIBS_NAME

# Pre-compiled regex patterns (optimization: compile once, use many times)
INCLUDE_COMBINED_RE = re.compile(
r'#include\s*([<"])([^>"]+)[>"]'
Expand All @@ -181,7 +187,7 @@ def __init__(self):
"bench", "benchmark", "demo", "sample"
]
self.C_STANDARD_LIB = "glibc"
self.RPM_LIBS_DIR = C_DEP_LIBS_NAME
self.RPM_LIBS_DIR = self.DEP_SOURCE_DIR
self.output_json_path = None
self.ccp_dep_tree = None

Expand Down Expand Up @@ -788,6 +794,7 @@ def find_project_name(self, root_dir="."):


class GoDependencyTreeBuilder(DependencyTreeBuilder):
DEP_SOURCE_DIR = "vendor"

def install_dependencies(self, manifest_path: Path):
self.download_go_mod_vendor(manifest_path)
Expand Down Expand Up @@ -897,6 +904,7 @@ def extract_package_name(self, package_name: str) -> str:
return package_name

class JavaDependencyTreeBuilder(DependencyTreeBuilder):
DEP_SOURCE_DIR = "dependencies-sources"

def __init__(self, query: str):
self._query = query
Expand All @@ -911,7 +919,7 @@ def __check_file_exists(self, dir_path: str | Path, filename: str) -> bool:
def install_dependencies(self, manifest_path: Path):
mvn_command = "mvn"
settings_path = os.getenv('JAVA_MAVEN_DEFAULT_SETTINGS_FILE_PATH','../../../../kustomize/base/settings.xml')
source_path = "dependencies-sources"
source_path = self.DEP_SOURCE_DIR

if self.__check_file_exists(manifest_path, "mvnw"):
mvn_command = "./mvnw"
Expand Down Expand Up @@ -1168,6 +1176,7 @@ def looks_like_version(v: str) -> bool:
return depth, coord

class PythonDependencyTreeBuilder(DependencyTreeBuilder):
DEP_SOURCE_DIR = TRANSITIVE_ENV_NAME

def build_tree(self, manifest_path: Path) -> defaultdict[Any, list]:
venv_python = f'{manifest_path}/{TRANSITIVE_ENV_NAME}/bin/python'
Expand Down Expand Up @@ -1565,6 +1574,7 @@ def install_dependency(self, dependency, repo_path):
logger.warning('Failed to install dependency %s', dependency)

class JavaScriptDependencyTreeBuilder(DependencyTreeBuilder):
DEP_SOURCE_DIR = "node_modules"

def build_tree(self, manifest_path: Path) -> dict[str, list[str]]:

Expand Down Expand Up @@ -1639,6 +1649,25 @@ def get_dependency_tree_builder(programming_language: Ecosystem, query: str = ""
)


# Ecosystem -> builder class lookup. Lets ECOSYSTEM_DEP_DIRS be derived from
# each class-level DEP_SOURCE_DIR attribute without instantiating any builder.
_ECOSYSTEM_BUILDER_MAP: dict[Ecosystem, type[DependencyTreeBuilder]] = {
    Ecosystem.C_CPP: CCppDependencyTreeBuilder,
    Ecosystem.GO: GoDependencyTreeBuilder,
    Ecosystem.JAVA: JavaDependencyTreeBuilder,
    Ecosystem.PYTHON: PythonDependencyTreeBuilder,
    Ecosystem.JAVASCRIPT: JavaScriptDependencyTreeBuilder,
}

# Ecosystem value -> dependency source directory prefix (e.g. "vendor/").
# Built from each builder's DEP_SOURCE_DIR; builders that declare an empty
# DEP_SOURCE_DIR are intentionally left out of the mapping.
ECOSYSTEM_DEP_DIRS: dict[str, str] = {}
for _eco, _builder_cls in _ECOSYSTEM_BUILDER_MAP.items():
    if _builder_cls.DEP_SOURCE_DIR:
        ECOSYSTEM_DEP_DIRS[_eco.value] = _builder_cls.DEP_SOURCE_DIR + "/"
del _eco, _builder_cls


class DependencyTree:
"""
A class that represents a dependency tree to access an appropriate
Expand Down
64 changes: 59 additions & 5 deletions src/exploit_iq_commons/utils/document_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,52 @@ def lazy_parse(self, blob: Blob) -> typing.Iterator[Document]:
)


class _LoggingEmbeddingProxy:
    """Wraps an Embeddings instance to log per-batch progress during VDB creation.

    FAISS calls embed_documents once with all texts; the NIM SDK loops
    internally in batches of max_batch_size calling _embed per batch.
    This proxy intercepts embed_documents and does the batching itself
    so it can log progress between batches.

    NOTE: Tightly coupled with langchain_nvidia_ai_endpoints.NVIDIAEmbeddings.
    Calls the private _embed(texts, model_type="passage") method directly.
    Other Embeddings implementations (langchain ABC) don't expose _embed,
    so this proxy will break if the embedding type changes or if NVIDIAEmbeddings
    renames/removes _embed in a future version.
    """

    def __init__(self, embedding, total_chunks: int, start_time: float):
        # embedding: the wrapped NVIDIAEmbeddings-like instance.
        # total_chunks: expected total number of chunks, used for %/ETA math.
        # start_time: epoch seconds when embedding began (basis for the rate).
        self._embedding = embedding
        self._total_chunks = total_chunks
        self._start_time = start_time
        self._embedded = 0  # running count of chunks embedded so far

    def embed_documents(self, texts):
        """Embed *texts* in max_batch_size slices, logging progress after each batch."""
        # Guard against max_batch_size being present but falsy (None or 0),
        # which would otherwise crash range() or produce empty slicing steps.
        batch_size = getattr(self._embedding, "max_batch_size", None) or 128
        all_embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            batch_start = time.time()
            all_embeddings.extend(self._embedding._embed(batch, model_type="passage"))
            self._embedded += len(batch)
            elapsed = time.time() - self._start_time
            rate = self._embedded / elapsed if elapsed > 0 else 0
            remaining_min = ((self._total_chunks - self._embedded) / rate / 60) if rate > 0 else 0
            logger.info("Embedding progress: %d / %d chunks (%.1f%%) - batch took %.2fs - ETA %.1f min",
                        self._embedded, self._total_chunks,
                        self._embedded / self._total_chunks * 100,
                        time.time() - batch_start,
                        remaining_min)
        return all_embeddings

    def embed_query(self, text):
        """Delegate single-query embedding straight to the wrapped instance."""
        return self._embedding.embed_query(text)

    def __getattr__(self, name):
        # Transparently forward every other attribute (model name, config, ...)
        # so the proxy is a drop-in replacement for the wrapped embedding.
        return getattr(self._embedding, name)


class DocumentEmbedding:
"""
A class to create a FAISS database from a list of source documents. The source documents are collected from git
Expand Down Expand Up @@ -374,8 +420,10 @@ def collect_documents(self, source_info: SourceDocumentsInfo) -> list[Document]:
"""

repo_path = self.get_repo_path(source_info)
cache_name = source_info.type if source_info.type != "code" else ""
documents, documents_were_in_cache = retrieve_from_cache(self._pickle_cache_directory,
source_info.git_repo, source_info.ref)
source_info.git_repo, source_info.ref,
documents_name=cache_name)
if documents_were_in_cache or len(documents) > 0:
return documents

Expand All @@ -387,7 +435,8 @@ def collect_documents(self, source_info: SourceDocumentsInfo) -> list[Document]:
with repo_lock:
# Re-check cache — another thread may have populated it while we waited.
documents, documents_were_in_cache = retrieve_from_cache(self._pickle_cache_directory,
source_info.git_repo, source_info.ref)
source_info.git_repo, source_info.ref,
documents_name=cache_name)
if documents_were_in_cache or len(documents) > 0:
return documents

Expand All @@ -403,7 +452,8 @@ def collect_documents(self, source_info: SourceDocumentsInfo) -> list[Document]:
documents = loader.load()

logger.info("Collected documents for '%s', Document count: %d", repo_path, len(documents))
save_to_cache(self._pickle_cache_directory, source_info.git_repo, source_info.ref, documents)
save_to_cache(self._pickle_cache_directory, source_info.git_repo, source_info.ref, documents,
documents_name=cache_name)
return documents

def create_vdb(self, source_infos: list[SourceDocumentsInfo], output_path: PathLike):
Expand Down Expand Up @@ -465,8 +515,12 @@ def create_vdb(self, source_infos: list[SourceDocumentsInfo], output_path: PathL

embedding_start_time = time.time()

# Wrap embedding in a proxy that logs batch progress
total_chunks = len(chunked_documents)
logging_embedding = _LoggingEmbeddingProxy(self._embedding, total_chunks, embedding_start_time)

# Create the FAISS database
db = FAISS.from_documents(chunked_documents, self._embedding)
db = FAISS.from_documents(chunked_documents, logging_embedding)

logger.info("Completed embedding in %.2f seconds for '%s'", time.time() - embedding_start_time, output_path)

Expand Down Expand Up @@ -513,7 +567,7 @@ def build_vdbs(self,
# Create embeddings for each source type
for source_type in ["code", "doc"]:

if ignore_code_embedding:
if ignore_code_embedding and source_type == "code":
continue

# Filter the source documents
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -763,3 +763,9 @@ def is_call_allowed(self, pkg_docs: list[Document], caller_function: Document, c
return False

return True

def get_import_search_patterns(self, package_name: str) -> list[re.Pattern]:
    """Return compiled regexes that match C/C++ #include directives pulling in
    headers under *package_name* (both the <...> and "..." forms)."""
    pkg = re.escape(package_name)
    include_pattern = re.compile(
        rf'#include\s*[<"]({pkg}[^>"]*)[>"]',
        re.IGNORECASE | re.MULTILINE,
    )
    return [include_pattern]
Original file line number Diff line number Diff line change
Expand Up @@ -598,4 +598,11 @@ def is_package_imported(self, code_content: str, identifier: str, callee_package
package_name = import_package_line.split(r"\s")[1]
if package_name.strip().lower() == callee_package.strip().lower():
return True
return False
return False

def get_import_search_patterns(self, package_name: str) -> list[re.Pattern]:
    """Return compiled regexes matching Go import statements for *package_name*,
    covering both the single-line form and the parenthesized import block."""
    pkg = re.escape(package_name)
    flags = re.IGNORECASE | re.MULTILINE
    single_import = rf'import\s+"({pkg}[^"]*)"'
    grouped_import = rf'import\s+\(\s*[^)]*"({pkg}[^"]*)"'
    return [re.compile(single_import, flags), re.compile(grouped_import, flags)]
Loading
Loading