From aa8e9ffd7fb4b33845a0725e51bc233946292625 Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Fri, 17 Oct 2025 00:45:23 +0100 Subject: [PATCH 01/26] update(dependent miner): add java maven miner --- security_pattern_miner/src/config/constants.py | 4 +++- .../src/dependent_miner/java.py | 11 +++++++++++ security_pattern_miner/src/runner.py | 15 +++++++++++---- 3 files changed, 25 insertions(+), 5 deletions(-) create mode 100644 security_pattern_miner/src/dependent_miner/java.py diff --git a/security_pattern_miner/src/config/constants.py b/security_pattern_miner/src/config/constants.py index 9e8ca7f..6cb0188 100644 --- a/security_pattern_miner/src/config/constants.py +++ b/security_pattern_miner/src/config/constants.py @@ -5,4 +5,6 @@ SPRING_BOOT = "spring-boot" PYPI = "Pypi" -GITHUB = "github" \ No newline at end of file +GITHUB = "github" +MAVEN = "Maven" +SPRING_SECURITY ="org.springframework.security:spring-security-core" \ No newline at end of file diff --git a/security_pattern_miner/src/dependent_miner/java.py b/security_pattern_miner/src/dependent_miner/java.py new file mode 100644 index 0000000..6157554 --- /dev/null +++ b/security_pattern_miner/src/dependent_miner/java.py @@ -0,0 +1,11 @@ +from config.libraries_io import LibrariesIOConfig +from .base import DependentMiner, LibrariesIODependentMiner +import requests +from typing import List +from config.constants import JAVA, MAVEN +from schemas.libraries_io_response import DependentRepositoryInfo + + +class JavaDependentMiner(LibrariesIODependentMiner): + def __init__(self, config: LibrariesIOConfig): + super().__init__(package_manager=MAVEN, language=JAVA, config=config) diff --git a/security_pattern_miner/src/runner.py b/security_pattern_miner/src/runner.py index 5cc9c09..8d3dd79 100644 --- a/security_pattern_miner/src/runner.py +++ b/security_pattern_miner/src/runner.py @@ -2,16 +2,23 @@ from utils.logger import logger import logging from dependent_miner.python import PythonDependentMiner + +from 
dependent_miner.java import JavaDependentMiner from repo_crawler.base import GitCrawler -from config.constants import PYTHON, PYPI +from config.constants import PYTHON, PYPI, JAVA, MAVEN from config.crawler import GitCrawlerConfig from config.libraries_io import LibrariesIOConfig +dependent_miners = { + (PYTHON, PYPI): PythonDependentMiner, + (JAVA, MAVEN): JavaDependentMiner +} + class Pipeline: def __init__(self, args): - if args.language.lower() != PYTHON or args.package_manager != PYPI: - raise ValueError("Currently, only Python language with PyPI package manager is supported.") + # if args.language.lower() != PYTHON or args.package_manager != PYPI: + # raise ValueError("Currently, only Python language with PyPI package manager is supported.") self.args = args if args.max_pages: @@ -30,7 +37,7 @@ def __init__(self, args): GitCrawlerConfig.root_data_dir = args.root_data_dir GitCrawlerConfig.cloned_repos_dir = os.path.join(args.root_data_dir, "cloned_repos") - self.dependent_miner = PythonDependentMiner(LibrariesIOConfig) + self.dependent_miner = dependent_miners.get((args.language.lower(), args.package_manager), None)(LibrariesIOConfig) self.repo_crawler = GitCrawler(GitCrawlerConfig) def run(self, package_names: list[str]): From e2a7420c1a580d63433ffee7d4a89a74306180f5 Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Fri, 17 Oct 2025 00:47:23 +0100 Subject: [PATCH 02/26] update(depend): zoekt as submodule --- .gitmodules | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .gitmodules diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..a519b54 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "zoekt"] + path = zoekt + url = https://github.com/minhna1112/zoekt From fbff28a10f7d4753600e0bae92b7936a26a6918e Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Fri, 17 Oct 2025 00:53:27 +0100 Subject: [PATCH 03/26] update(depend): add zoekt as a submodule --- zoekt | 1 + 1 file changed, 1 insertion(+) create mode 160000 zoekt diff 
--git a/zoekt b/zoekt new file mode 160000 index 0000000..29ddb55 --- /dev/null +++ b/zoekt @@ -0,0 +1 @@ +Subproject commit 29ddb55de4c794db5dc00915a231ef2079effffc From 7551da48c7c7360e52094c9f891963c96c8d5d16 Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Fri, 17 Oct 2025 01:24:50 +0100 Subject: [PATCH 04/26] update(.gitignore): zoekt index volumes --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index fffe963..6e31355 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ *.pyc *.jsonl *.sif -**/*.pyc \ No newline at end of file +**/*.pyc +./build/volumes/zoekt/index-data/* \ No newline at end of file From 7427d6fbd242126d5e029becfee943e3276a004b Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Fri, 17 Oct 2025 17:04:12 +0100 Subject: [PATCH 05/26] docker compose for zoekt --- docker-compose.yml | 62 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index a1eeeea..addb2a5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,7 +8,7 @@ services: - ./build/volumes/data:/data env_file: - ./.env - command: ["python", "./runner.py"] + command: ["python", "./runner.py", "--get_dependents", "--package_names", "org.springframework.security:spring-security-core", "--language", "java", "--package_manager", "Maven", "--root_data_dir=/data"] deploy: resources: limits: @@ -17,3 +17,63 @@ services: reservations: cpus: '4.0' memory: 4G + + zoekt-webserver: + image: zoekt-local + ports: + - "6070:6070" # Default zoekt-webserver port + volumes: + - ./build/volumes/zoekt/index-data:/data/index:ro # Read-only access to index data + environment: + - DATA_DIR=/data/index + - GOGC=25 + command: zoekt-webserver -index /data/index -pprof -rpc -listen :6070 + restart: unless-stopped + healthcheck: + test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:6070/"] + interval: 30s + timeout: 10s + 
retries: 3 + + zoekt-indexer: + image: zoekt-local + volumes: + - ./build/volumes/zoekt/index-data:/data/index + - ./build/volumes/data/cloned_repos:/repos:ro # Mount local repos directory + working_dir: /repos + command: | + sh -c " + echo 'Starting repository indexing...' + mkdir -p /data/index + + # Check if repos directory exists and has content + if [ ! -d '/repos' ]; then + echo 'Error: /repos directory not found' + exit 1 + fi + + # List contents of repos directory + echo 'Contents of /repos:' + ls -la /repos/ + + # Find and index repositories + cd /repos + for repo in */; do + if [ -d \"$$repo\" ]; then + echo \"Found directory: $$repo\" + if [ -d \"$$repo/.git\" ]; then + echo \"Indexing Git repository: $$repo\" + zoekt-git-index -index /data/index \"/repos/$$repo\" + else + echo \"Indexing directory: $$repo\" + zoekt-index -index /data/index \"/repos/$$repo\" + fi + fi + done + + if [ ! -f /data/index/*.zoekt ]; then + echo 'No repositories found to index. Please add repositories to the ./repos directory' + else + echo 'Indexing complete!' 
+ fi + " From 9752be5f4cc55d9e20bc041a5d759dcee4c951d3 Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Fri, 17 Oct 2025 17:45:07 +0100 Subject: [PATCH 06/26] update .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 6e31355..6cca1ac 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,9 @@ ./build/volumes/* +./build/volumes/zoekt/index-data/*.zoekt .env */__pycache__/* *.pyc *.jsonl *.sif **/*.pyc -./build/volumes/zoekt/index-data/* \ No newline at end of file +# \ No newline at end of file From e68a365a17e752c87f714d04cc8c83bb4af532e3 Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Fri, 17 Oct 2025 17:46:04 +0100 Subject: [PATCH 07/26] refactor: update .gitignore script for better organization --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 6cca1ac..cd7e666 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ ./build/volumes/* -./build/volumes/zoekt/index-data/*.zoekt +build/volumes/zoekt/index-data/*.zoekt .env */__pycache__/* *.pyc From 1247f16e90087d3b174fd8e53adf8ece29919ba2 Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Sat, 8 Nov 2025 17:15:51 +0000 Subject: [PATCH 08/26] update .gitignore --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index cd7e666..5825ff8 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,7 @@ build/volumes/zoekt/index-data/*.zoekt *.jsonl *.sif **/*.pyc -# \ No newline at end of file +.DS_Store +.vscode/ +.claude +.idea/ From 004ef1e4e823d8701f567ff21548a5209c43e0d4 Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Sat, 8 Nov 2025 17:16:33 +0000 Subject: [PATCH 09/26] pyc --- .../config/__pycache__/constants.cpython-311.pyc | Bin 355 -> 0 bytes .../__pycache__/libraries_io.cpython-311.pyc | Bin 611 -> 0 bytes .../__pycache__/base.cpython-311.pyc | Bin 4294 -> 0 bytes .../__pycache__/python.cpython-311.pyc | Bin 1564 -> 0 bytes 
.../libraries_io_request.cpython-311.pyc | Bin 805 -> 0 bytes .../libraries_io_response.cpython-311.pyc | Bin 2713 -> 0 bytes .../__pycache__/libraries_io.cpython-311.pyc | Bin 1070 -> 0 bytes .../src/utils/__pycache__/logger.cpython-311.pyc | Bin 911 -> 0 bytes 8 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 security_pattern_miner/src/config/__pycache__/constants.cpython-311.pyc delete mode 100644 security_pattern_miner/src/config/__pycache__/libraries_io.cpython-311.pyc delete mode 100644 security_pattern_miner/src/dependent_miner/__pycache__/base.cpython-311.pyc delete mode 100644 security_pattern_miner/src/dependent_miner/__pycache__/python.cpython-311.pyc delete mode 100644 security_pattern_miner/src/schemas/__pycache__/libraries_io_request.cpython-311.pyc delete mode 100644 security_pattern_miner/src/schemas/__pycache__/libraries_io_response.cpython-311.pyc delete mode 100644 security_pattern_miner/src/utils/__pycache__/libraries_io.cpython-311.pyc delete mode 100644 security_pattern_miner/src/utils/__pycache__/logger.cpython-311.pyc diff --git a/security_pattern_miner/src/config/__pycache__/constants.cpython-311.pyc b/security_pattern_miner/src/config/__pycache__/constants.cpython-311.pyc deleted file mode 100644 index f982f09da34af88d9f3d40e762f4e453c76e128b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 355 zcmZ8d!AiqG5ZyG}jkdHX9tE$`gD&+fLMo{7(1a!m%_XMUw29cwc6TepQ~ttF5dXyw zD0>q80im~^+!Vd|4l{2aZ+OGJbviAe@qj5DGI5eSF(KALkfB0 z;Km&)cx&I=%DS@s!1f%42A89;7pS)Hjhx|iG;rNOFO5Ct9jxoCHYvUnDHnL2(~QRb ze*X+F6M|Vru;588>##_q$&%-?j94sX%BjHrl}I~NzTHU*I?ZQTH+?yk;*^z}o%4A? 
umQU#g-_xn}L&UKz-=<+0tKIzyt*lN3SFMA}+K!*y!>Ya2Dp*0@)!rYMLu-@( diff --git a/security_pattern_miner/src/config/__pycache__/libraries_io.cpython-311.pyc b/security_pattern_miner/src/config/__pycache__/libraries_io.cpython-311.pyc deleted file mode 100644 index 29aa6a665aa533712abffb5116d78a3491ba37ae..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 611 zcmZuuy-ve05WXY@3ZjJqqC5g+z$yb1Li`9-5#>i92C`ToHf>Oo7P~>PMTmicy>9?Q zJOFQjKw_w}GO-nviHUQXG5~kF^Y`8P&i45(lNkY&$Ka`YiR+speMu#tJBy$N4475} z3v?&})31OTw>rQBT2CyEY5Pz}y(_xyB++;=64G5n&;ky)7c5-|Zk+T_!S1&j_CxE5 zHNbRh5N(JVs&AAlyGOznE|=!cdfh*EE74ThT-#gSGdK3BxkFcXO}e>p_|CAwb|D?5 zDfMlSQyPs>>eX3LQ+Slp)4;AJ8NDu}!3uA3|126x8ble-KBs=*(S}{&G8$F1!9`5( zp{wrqRV4?K3rRfJuljbeSX>~1!$_mbh!lO!cu!GA8xfLr^B*RG2SmO z=t6yUuiKbHaVm%CE4|d4&a#+5s(wJOpz9c#rnO->jJ1<8w0sBtx(^lm{3j8g@D0wM Bo9X}n diff --git a/security_pattern_miner/src/dependent_miner/__pycache__/base.cpython-311.pyc b/security_pattern_miner/src/dependent_miner/__pycache__/base.cpython-311.pyc deleted file mode 100644 index d0fdf820b4e2ca4e8288e55ff9df728f58c2b5fa..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4294 zcmb6cTWlN0aqmI$NF7PNEy<=$*_LeDiDboTTnnm^G)4rsj$GJ{3$SPpXWojUd^|FH zN5^8RLInhr0gNI69Jp-~L?5ji!})Ok`q}*Tt2hZXd^o^>fFd9MqaX(;{84mvk390^ z*l73mW@l%1X7^@iX7~OQkH-)c`8WSuG64G*?X*j%h3qbU@Q5BSCb9{JOM0@{o9$&d zulHs9c!YbG``e4b!ty{Hzsn(XA6~C;h)45Ci>@OrhPk@}@Zj}Ic98LUfG6X@J-i`} zzk)LHCv<3rcZIip__mu+ubCE6b5;>sOC@a~!?_197OoMM6tH>W(mb}#<1#ihY*?4^ z`xR_j_c(W77p+_}3s#A&Trd_(Al%m_{C3G$EaZVWn8%i)x3wyTl2Rc$(8Y@d(}Fgv zm-2Z`a;=YGP5YB}PeAt$!YIpWD9d9(LHIEfTE#e&S3m`;IO6S=P1E3pz1>NHo?os22Ce{~80_Y}{NT;6_rr$F$F{g_K zW64lwW@cWSuH>}o@{%@fk{l%$jJ#svT!j>@6{W0N7AA%{y_cfENKBZF);4X6c1<<$ zRC&csP<#1IvziKi(NvJl78!v>uAyy7u8X^XqJ>P@l@#{LisHr;rC8D`ItE-;l=myD zPK}``M1--f04_iV0V91B3{gOxkHp)6BIa2DYyRVe2AVtIrOf6bQ8JE&=#$*G9uO3mRUXTO-4w;oQU6rDM%UQ68 z2Y^mej7ezz=n&kfqU3bdG!@09JDpCa*SwugL$7-Hg+`caE2nN?z5?LiYv{2Q|1feZ 
z@<X&YqV5>rmK-kd?%q;f+>;hrO zt@3%aD%5xj811s`+w3~S$k#Qr;ajIYind_4prmiLulQgFTor4g$r+znZLg!!VlcY~ z*5GTpPOXQ?NvnspURkDDgB4Avai1bhtnmOtFVp@h`aB$*k3+^!Yr+PN-{;ESwQ(pg zztk4c!Y`jwi?EeRbd}G~U*k2d3W2%G@3G^a9eD6{jV9RW!V}8K2rN6gVi^L%mB1q* z<>hhpdzWzz7j9rJT`};@GR|3m8kOQTOwy&rbSJB&7fU4F3bk}Knl2g8%j0yFpG{Y# zcd(UP0?F!dK!lWvtx@rSd?re$HQabny{W)#Y%Qic9L$suFpS~KbXa@IuApOs(2aE? zAE<<;aW{-_T5#&QVu2oOAup_$Lq|K>W1Ww2`#X4h7L(p5AmY+#FXU)~tsZ}I$OYFr z0;A2>!P0BU862udouPg8n3LG|aQ`cf{j*z%SvxW7j88lopW7Orb5a>+=-}hg6OGX` z%?LXDgB>IuPCTW+mJc>#XmAoTaeUb69r`BOZx2jtCMO%o$%mQQMrM91Id3QDANLO5 zPBdY`@A3A}avsv^8Xt6bAO7ms(0!A?%AOl90D%q=X5N;EAIYgL zIb|O_wJFaujlgM$YLWAMPMKAyCS5_GqiYSmialfLt&?7s7BZrDT2S96zQLaS1G+gs{-Hy4 z(z9Tf+Vc*|qZ;}}VwLMY^E6ZdfA|ykJMLrtRuZ&x-gSb065OcxsZ*|~sg~NhP_dwR z7Ms)I4wx$y%jc@+|JT@PN7&SyUdpMuc@B`amUA$hc?`fBvPZ9a^>5n4u^D^pEYt_* zz1HV3SIJ0(Zii|m z=^Heq8)<2B1-SH5y=J*&nAlBrWejQ-GEG-bCucdMlQ)h7se0=FwGGquF^I0AZzVJ^ z;~Ypg4xDmEC!D?^XDHQ-h6Z9g2*6Vc>XC0{lsvL=W;1cJkvLfw9uMrlJ+~oj4jgR^ z9IZ=E&p@N+(8lSlo=LlB(m6Q!Me&ctKa@Tz)h}%`EC0O&7Tp}Z+8DiB|B*A0vgOpa zJoreS*peqUfaW?vuXH3$0lgm(>Tv8%>IwzJBGd?Oymgz& zVO>`g@UL;8n$%6-^iu48DntR@2=lK1;M&rDn!FGi|59ioXdalXc`=9$g~sU37McgI z+uy&`IQnp& zKgj>dcqYwx|0d_iMPSg~G=~8;1&-q!bj1Fy>Y&6Ld$v(>jXe&Uw}aI|Z`r}>pcy+@ z9dz0bRtN2~gVjNY>|k|J&l-Cim_59n{3vo;-bASeO4%p{ZpOji(DUGw8M%` UbKningN?)vr9S-*7PH%b0fB80(f|Me diff --git a/security_pattern_miner/src/dependent_miner/__pycache__/python.cpython-311.pyc b/security_pattern_miner/src/dependent_miner/__pycache__/python.cpython-311.pyc deleted file mode 100644 index 3a4e245ea1e5f974b053e9b08484552d2bc8436c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1564 zcmbVM&x;gC6t3!D)3eSxCT?&=gau=uFFR~z4gp0C1d|2b9ilndm!h}3W~Q;br>&|% zM{?N1LKM7)y?R;`Hi7&D`4^;>1&4y1atPigm_zWCSJm5t8^?oISAYHLy;oK5eXpvY zEG{k}Xy?S!!Cx4mKh>aW_Mz}<3xr)1p$I1^#Xe3A-@qs`A~P|46TVhrrM7Qtot-#o z&99}d@9MFW)YAoj0V4x_jiTBPid<^!nF#H}TQ>h#WZpnt{UtRQ#1c>X+Gq!7_#wt z)*FW9&Lz$gy#(VQAXSQd93kI$fOJFZHST9HX@>YSd<>{;4w?KJ$_19T7jE+k>g 
zrS4b^hS_XmokOTntC|IZN?sHDBi=>g4eK5l_A5{H_zlZUiG*6=unDJ<5c&o08892( zvOxh5#AIv$GtE2*ZwGx!(jbEmlWr1Z{Skc;WQkvSd2*?2tirK zTmRP(@T5%7iK@aM>T*rxYCdKsV9)z!=TW;_1ziUh{u#i(+vpE>@nP*z?S;Gi++F^4 z>364am!G+p58TT|)n9=pl8uU%EJ&$z7!@Oyg);`hMO8{y>naEzQmyat9#nCER!OJ` zhKjJJ+W&Z;3NlNZdnWq`)a(-l9|Lf;^~%aNJE=6w0Qb+$W#Q;L4K?))fa0RL^ziPZ zyDysOpEu7xPM$T}2hDcjw%HQcy}GKpN}dq8Ku9`_Mv0OegxnbgNompaOOp%6)Nx~q zY8{MRJUkX5O%g&pTn4Rjy;6pJOt~shM~V*tAf)P<8rZ&qr|2E%V7v2ETbVn*b&4-G z587QlcrVat>Il4sw6+2`vx-xu?vQjDy)&X*aOsF~4(Cl3A#|s r>l4)6*3Sf;DvtIE@``zVg1#({_Nimz&J?{3#hK3EO8w6qn-2Ux>fm*e9QmulTP=Sdo=iq*Glu+VKi@5*u68 z&oUbil}I|Yxb;jJ=no|%(I%#xwi#AWkzNrXA(bTb&I$OyBt1lHi9Cm-A6B4>c2=@V zSSwkzWHqc`FWM*Z8!dDU=EYeL>@j4ZBq;k5Mq|L1_A(K@6&Ju$p>Wd0?$)403JZ!= z3YMW~(RQ$&f#!~$V(vQUZV~6j4(I7mjuVG>IUkKhvhn!F>SB*`q14a{w|m36zU^}P zv9h{>=nGk)CfB3j+<-Pg8moa4-EQ|uFpgx94P;<+__zPx!bV@~GmAivj~F(-4@%}X1ElUIjC(3epHiBWhqId`C+*qavSf_jUy}dT$Jald_16uqvWeet Cz1d3u diff --git a/security_pattern_miner/src/schemas/__pycache__/libraries_io_response.cpython-311.pyc b/security_pattern_miner/src/schemas/__pycache__/libraries_io_response.cpython-311.pyc deleted file mode 100644 index ca95b69cdff2aec5ee9727ea13d0485ea6e8846b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2713 zcmZuz&2QT_6qjtqv7I=Hn>4BOwPfDfdRyaV19s}VVJ$GESQ21GF98NY5gD6^EUBcl z!UYC6z)nLB!4RN6^w2}Gb=beMgLM;l-D$TD-KnQNQns8ZlAq)se|(Sci&z&W_A{Hz&nYCMtR~#Ggqv3=ZM~4mURSJ)Xi*{QeJNA(MkZU!WS2A9 zl}vUulU>VX*E8AoGT9Bq%6v96=ikp{>xz|?{UCGx!%X&OCVOkh7B20z+nKtJAv<|V z_ans?Q@i!!%oRS#Wbb6Iu$8IXlq~)fPTjUxdO$qlVB+}S6VK(;cUgDGIdVnmD6nnA zF*`&|VZtp&)rl17$YsZ(Y%yZ`1RJI=%7KSRxa0-A9n;0O+aaFWA_8&xjEFhzo2+F% zBa9oC8#r*tZPRH5Qls29xj{J(2p2_(pVDJ0N)pKxxR{b&M|R~7Wz1zpz-&=A5j?BRJjYDQSR$Ekga46$H9D7DVK(UzOZon)OCpl?Ti#VK!NsrMVAs;bsxsDHJ zfqWAszGY01JBGScB(~a+1hL(gm{Y^V#%YNu$t3H>(^jnKw`Bp1j(n}fiJAeXz9>MI z{D6xY+vL8%yN(rC6G1M=py&eYcRgr|g4pet)DcMaDbKA=Tq(nI!EINRTGVd`hXy2j z-IM)fm4N%yJCFqcKa#Bh(G)(#FVUzx64v@u#x 
zTVhj(a5Xu76gZeVEja*tPON}YzdH<3vU`n^C>p8_h9Sy^tV&=Lz|)5DBrt869gOna zXJXbcOviDdN~$XcL$H|zI&@uIa-nmdwct{W?#*dt`i>#|93tR6Bj z{%cOtdKW&??6uxrZ|`I;s@D4T^R)q52MN)7RG#TQ>^(er7|qT1_Ivv$`(u#U%q*Vq z)6M{`fP`ozDpz_ty`7VtXt6dxQbJUV(n@m+10*Fx3(?HT`|QX`^}_7jjsE<3Wq=^6 zwGgdEOUnZ!B}B{7!qNaq3DMFeJ_#Y6Pv!cVb^612fl9Tv+uJ?ajcTg{Bqclemi0W60mUfcXpIz$bPk$Qwn_f73bhMtL>xHmuzkPtPlbZg`G>-ksEgF7H0 zx&s|M`}~xSYhT-ZS$_fkZi0m9X0+LOU4I4REszjxMb}`9oWtUyEt1lY4cS+tO}f`8 zhyuJ&;2fxv6CbCd*TtseQ;W&}c=48&fAV6xuK~R)-i=S`J$4_4I^5iX$D6#a>yfrO d$dX7~A6%9A#ZwFJnpt@jq%|=&b+% diff --git a/security_pattern_miner/src/utils/__pycache__/libraries_io.cpython-311.pyc b/security_pattern_miner/src/utils/__pycache__/libraries_io.cpython-311.pyc deleted file mode 100644 index 8bdf80f06fcdc2813401182220fe27877f76705c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1070 zcmaJ<&ubGw6rRa$HnB}xY6XjmjXkvFk|p9vtW~O}MQNnLgSjl*-D$Gz?q;2t1rt(; z;6X(2Xh8%IqKFs&1pfkwki)Xz$&&{m&|6Nv*<|y>gD<;pzW3$5nfJY&PjhoK2yFG) z_u3zf&@U0hCJ({sA%Hi?L?*UT1rc-znFD*Dh|%O~CO$&Omr)iLwURA4az&1?Y$qM1 zq5w`3#Z*Y@KsHm)_`#4R4{g)eD8fRn<34)vK8* zqlTa@x2iE>_|)PJ=!$bfU8eraXpl*0RO55YX5%4fRz2@E!gE!^wb5jN6#dY%(1sW# zdkU_Nf`wdr`{(>hcYeKH3>MD!5T4t^1AnsKy-=7Wz81UJwkL^Td8H?f3xf6hht?Fa zd9S-!nq;6lS;nVuHp-m|CDveKguE6ix+m^kxMW24GC$e);FDZpMvXW+oB9jBY;;Sh z&;OCTO>WYQ5Q^K&n&6-k{{)!uuO1i;PNH4|aP?v}Y4FcO$Ic$_qnSTF=E8X}eq3Co7eK>`qmlryS1r{DE3 zQYiKH)+}56BYv4V(I?I*<$O*vRHXicTDB9*HH|dY-=3FS2vB;N$pQ75Ss=_qq~2uO z{4o@hzZs(_6QzNIO1sfO9!Vbu9{kDg#AVA*xhnriLUk_Rs75$Cvjm<6BnuiX0hn$5RBj2|glvpih ziR4fREw|yZtMif!1O_4liGj>OVZiR4$fn6^nI;jLl)9CD+6Oy>NX=;j%G;SUqSD>D zPm;21d)no9xTN*q%F<7K9O Date: Sat, 8 Nov 2025 21:25:49 +0000 Subject: [PATCH 10/26] query constructor --- .../src/config/constants.py | 10 +- .../src/config/queries_loader.py | 6 + .../obscure_token_access_control.yaml | 457 ++++++++++++++++++ .../patterns/opaque_token_authentication.yaml | 320 ++++++++++++ .../password_based_authentication.yaml | 234 +++++++++ 
.../session_based_access_control.yaml | 366 ++++++++++++++ .../verifiable_token_authentication.yaml | 312 ++++++++++++ .../src/context_retriever/queries_loader.py | 61 +++ .../src/context_retriever/zoekt_retriever.py | 0 security_pattern_miner/src/runner.py | 23 +- .../tests/authentication.py | 1 + 11 files changed, 1787 insertions(+), 3 deletions(-) create mode 100644 security_pattern_miner/src/config/queries_loader.py create mode 100644 security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/obscure_token_access_control.yaml create mode 100644 security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/opaque_token_authentication.yaml create mode 100644 security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml create mode 100644 security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/session_based_access_control.yaml create mode 100644 security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/verifiable_token_authentication.yaml create mode 100644 security_pattern_miner/src/context_retriever/queries_loader.py create mode 100644 security_pattern_miner/src/context_retriever/zoekt_retriever.py create mode 100644 security_pattern_miner/tests/authentication.py diff --git a/security_pattern_miner/src/config/constants.py b/security_pattern_miner/src/config/constants.py index 6cb0188..1f172ef 100644 --- a/security_pattern_miner/src/config/constants.py +++ b/security_pattern_miner/src/config/constants.py @@ -7,4 +7,12 @@ PYPI = "Pypi" GITHUB = "github" MAVEN = "Maven" -SPRING_SECURITY ="org.springframework.security:spring-security-core" \ No newline at end of file +SPRING_SECURITY ="org.springframework.security:spring-security-core" + +PASSWORD_BASED_AUTHENTICATION = "password_based_authentication" +VERIFIABLE_TOKEN_AUTHENTICATION = "verifiable_token_authentication" +OPAQUE_TOKEN_AUTHENTICATION = 
"opaque_token_authentication" +OBSCURE_TOKEN_ACCESS_CONTROL = "obscure_token_access_control" +SESSION_BASED_ACCESS_CONTROL = "session_based_access_control" + + diff --git a/security_pattern_miner/src/config/queries_loader.py b/security_pattern_miner/src/config/queries_loader.py new file mode 100644 index 0000000..0ff2d25 --- /dev/null +++ b/security_pattern_miner/src/config/queries_loader.py @@ -0,0 +1,6 @@ +import os + +class QueriesLoaderConfig: + root_data_dir = os.getenv("ROOT_DATA_DIR", "/data") + repos_name_dir = os.path.join(root_data_dir, "dependent_repos_info") + output_queries_dir = os.path.join(root_data_dir, "output_queries") \ No newline at end of file diff --git a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/obscure_token_access_control.yaml b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/obscure_token_access_control.yaml new file mode 100644 index 0000000..82e0b58 --- /dev/null +++ b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/obscure_token_access_control.yaml @@ -0,0 +1,457 @@ +pattern: + name: "Obscure Token-Based Access Control (API Keys)" + id: "01_01_005" + description: "A subject is both authenticated and authorized based on an obscure token (API key, PAT). 
Long-lived tokens that combine authentication and authorization with potentially limited privileges" + +dependencies: + - fastapi + - sqlalchemy + - hashlib # built-in + - secrets # built-in + - datetime # built-in + +roles: + enforcer: + description: "Ensures the token provided by the Subject is verified before processing" + queries: + - query: "APIKeyHeader X-API-Key auto_error" + description: "API key header enforcement" + priority: high + + - query: "APIKeyQuery api_key Security" + description: "API key query parameter enforcement" + priority: medium + + - query: "api_key_header = APIKeyHeader(name=\"X-API-Key\")" + description: "API key header scheme initialization" + priority: high + + - query: "api_key: str = Security(api_key_header)" + description: "API key dependency with Security" + priority: high + + - query: "api_key: Optional[str] = Security(api_key_header)" + description: "Optional API key for mixed auth" + priority: medium + + validator: + description: "Validates API key and checks permissions (combined auth + authz)" + queries: + - query: "class APIKeyValidator required_permissions" + description: "Combined auth+authz validator" + priority: high + + - query: "validate_token HTTPException 401 403" + description: "API key validation with auth/authz errors" + priority: high + + - query: "async def __call__ api_key: str = Security" + description: "Validator callable with API key" + priority: high + + - query: "token_info = manager.validate_token(api_key)" + description: "Token validation call" + priority: high + + - query: "if not token_info: raise HTTPException 401" + description: "Invalid token handling" + priority: medium + + - query: "missing = set(required_permissions) - token_info.permissions" + description: "Missing permission check" + priority: high + + hasher: + description: "Calculates the hash value for a token" + queries: + - query: "hashlib.sha256 token.encode hexdigest" + description: "Token hashing with SHA-256" + priority: high + + - 
query: "hashlib.blake2b token_hash" + description: "Token hashing with BLAKE2b" + priority: medium + + - query: "def hash_token(token: str) -> str" + description: "Token hashing function" + priority: high + + - query: "token_hash = hashlib.sha256(token.encode()).hexdigest()" + description: "Direct token hashing call" + priority: high + + token_manager: + description: "Keeps track of valid tokens with principals and permissions" + queries: + - query: "class APIKey token_hash principal permissions" + description: "API key database model" + priority: high + + - query: "db.query APIKey token_hash" + description: "API key database queries" + priority: high + + - query: "class APIKeyManager create_api_key validate_token" + description: "API key manager class" + priority: high + + - query: "api_key.permissions api_key.principal" + description: "API key attributes access" + priority: medium + + - query: "expires_at created_at last_used" + description: "API key lifecycle tracking" + priority: medium + + - query: "Column JSON permissions" + description: "Permission storage in database" + priority: medium + + token_generator: + description: "Generates a new token when requested" + queries: + - query: "secrets.token_urlsafe 32" + description: "Secure API key generation (32+ bytes)" + priority: high + + - query: "secrets.token_hex 32" + description: "Hex token generation" + priority: medium + + - query: "def generate_token() -> str" + description: "Token generation function" + priority: high + + - query: "@staticmethod def generate_token" + description: "Static token generator method" + priority: medium + + registrar: + description: "Generates new tokens and registers them with the token manager" + queries: + - query: "@app.post /api-keys secrets.token" + description: "API key creation endpoint" + priority: high + + - query: "token_hash = hashlib.sha256 db.add APIKey" + description: "API key registration with hashing" + priority: high + + - query: "return api_key Save this key" 
+ description: "API key response (shown once)" + priority: high + + - query: "def create_api_key principal permissions expires_delta" + description: "API key creation function signature" + priority: high + + - query: "async def create_api_key principal: str = Depends(get_authenticated_user)" + description: "API key creation requiring authentication" + priority: high + + permission_checker: + description: "Checks if token has required permissions" + queries: + - query: "token_info.permissions HTTPException 403" + description: "Permission verification" + priority: high + + - query: "if self.required_permissions missing_perms" + description: "Required permission check" + priority: high + + - query: "set(required_permissions) - token_info.permissions" + description: "Missing permission calculation" + priority: high + +complete_implementation: + description: "Queries to find files with complete API key authentication implementation" + queries: + - query: "APIKeyHeader hashlib.sha256 secrets.token db.query APIKey lang:python" + description: "Complete API key authentication pattern" + priority: critical + min_matches: 4 + + - query: "@app.post /api-keys secrets.token_urlsafe hash db.add lang:python -file:test" + description: "API key creation endpoint" + priority: high + min_matches: 3 + + - query: "class APIKeyValidator validate_token permissions HTTPException lang:python" + description: "API key validator with authorization" + priority: high + min_matches: 3 + +endpoints: + create_api_key: + queries: + - "@app.post /api-keys" + - "async def create_api_key name: str permissions: List[str]" + - "token = secrets.token_urlsafe" + - "message Save this key securely" + + list_api_keys: + queries: + - "@app.get /api-keys" + - "def list_api_keys principal = Depends" + - "manager.list_user_keys(principal)" + + revoke_api_key: + queries: + - "@app.delete /api-keys/{key_id}" + - "async def revoke_api_key key_id: str" + - "db.delete(api_key)" + + protected_endpoint: + queries: + - 
"token_info: TokenInfo = Depends(APIKeyValidator" + - "token_info = Depends(require_api_permissions" + - "api_key: str = Security(api_key_header)" + +api_key_features: + named_keys: + queries: + - "name: Optional[str] api_key.name" + - "Create API key with name" + + expiration: + queries: + - "expires_at: Optional[DateTime]" + - "expires_delta: Optional[timedelta]" + - "if api_key.expires_at and datetime.utcnow() > api_key.expires_at" + + last_used_tracking: + queries: + - "last_used: Optional[DateTime]" + - "api_key.last_used = datetime.utcnow()" + - "update_last_used" + + scoped_permissions: + queries: + - "permissions: List[str]" + - "api_key.permissions" + - "token-specific privileges" + + multiple_keys: + queries: + - "list_user_keys(principal)" + - "for key in keys" + - "api_keys: List[APIKey]" + +permission_models: + permission_enum: + queries: + - "class Permission str Enum" + - "Permission.DATA_READ Permission.DATA_WRITE" + - "Set[str] permissions" + + token_info: + queries: + - "class TokenInfo principal permissions" + - "TokenInfo(principal=, permissions=)" + - "token_info.principal token_info.permissions" + + permission_check: + queries: + - "def require_api_permissions(permissions: List[str])" + - "APIKeyValidator(required_permissions=permissions)" + - "missing = set(required) - token_info.permissions" + +security_features: + hash_storage: + queries: + - "token_hash = hashlib.sha256(token.encode()).hexdigest()" + - "Column(String, unique=True) token_hash" + - "hash_token store_hash" + + show_once: + queries: + - "Save this key securely won't be shown again" + - "return {\"api_key\": token, \"message\":" + - "plaintext token database hash" + + revocation: + queries: + - "db.delete(api_key)" + - "revoke_token invalidate_key" + - "@app.delete /api-keys" + + entropy: + queries: + - "secrets.token_urlsafe(32)" + - "secrets.token_urlsafe(64)" + - "128 bits 256 bits" + +sensitive_actions: + description: "Actions that should NOT be allowed with API keys" + 
queries: + - query: "@app.post /account/delete Depends(get_authenticated_user) -APIKey" + description: "Account deletion requiring full auth" + priority: high + + - query: "@app.post /api-keys Depends(get_authenticated_user)" + description: "API key creation requiring password auth" + priority: high + + - query: "@app.post /password/change -api_key -APIKey" + description: "Password change not accessible via API key" + priority: high + +anti_patterns: + description: "Security anti-patterns to detect" + + plaintext_storage: + query: "Column String api_key -hash -token_hash" + severity: critical + + weak_tokens: + query: "secrets.token_urlsafe(8) secrets.token_urlsafe(16)" + severity: critical + + no_hashing: + query: "api_key = secrets.token db.add -hash" + severity: critical + + predictable_tokens: + query: "uuid.uuid4() random.randint -secrets" + severity: critical + + no_expiration: + query: "class APIKey -expires_at -expiration" + severity: medium + + overprivileged_keys: + query: "permissions = [\"*\"] permissions = [\"all\"]" + severity: high + + api_key_for_sensitive: + query: "@app.delete /account api_key APIKeyHeader" + severity: high + +filters: + language: python + file_extension: "\\.py$" + exclude_tests: true + exclude_forks: true + exclude_archived: true + +search_strategy: + steps: + - name: "Find Enforcer" + queries: ["APIKeyHeader", "APIKeyQuery", "X-API-Key"] + + - name: "Find Token Generator" + queries: ["secrets.token_urlsafe", "secrets.token_hex"] + + - name: "Find Hasher" + queries: ["hashlib.sha256", "hash_token"] + + - name: "Find Token Manager" + queries: ["class APIKey", "token_hash principal permissions"] + + - name: "Find Validator" + queries: ["validate_token", "APIKeyValidator"] + + - name: "Verify Security" + queries: ["hashlib secrets db.query", "httponly=False secure=False"] + +best_practices: + token_generation: + queries: + - "secrets.token_urlsafe(32)" + - "secrets.token_urlsafe(64)" + - "token_bytes(32)" + description: "At least 
128 bits (16 bytes) of entropy" + + hash_algorithm: + queries: + - "hashlib.sha256" + - "hashlib.blake2b" + - "hashlib.sha3_256" + description: "Use secure hash functions" + + permission_scoping: + queries: + - "permissions: List[str]" + - "limited_permissions subset" + - "read-only write-only" + description: "Scope permissions per API key" + + expiration: + queries: + - "expires_at: Optional[DateTime]" + - "timedelta(days=90)" + - "check_expiration" + description: "Set expiration dates for API keys" + + revocation: + queries: + - "@app.delete /api-keys/{key_id}" + - "revoke_token invalidate" + - "db.delete(api_key)" + description: "Allow users to revoke API keys" + +database_models: + sqlalchemy: + queries: + - "class APIKey(Base):" + - "__tablename__ = \"api_keys\"" + - "id = Column(String, primary_key=True)" + - "token_hash = Column(String, unique=True, nullable=False)" + - "principal = Column(String, nullable=False)" + - "permissions = Column(JSON)" + - "created_at = Column(DateTime, default=datetime.utcnow)" + - "expires_at = Column(DateTime, nullable=True)" + - "last_used = Column(DateTime, nullable=True)" + + tortoise: + queries: + - "class APIKey(Model):" + - "token_hash = fields.CharField(max_length=64, unique=True)" + - "principal = fields.CharField(max_length=255)" + - "permissions = fields.JSONField()" + - "created_at = fields.DatetimeField(auto_now_add=True)" + +integration_patterns: + with_session_auth: + queries: + - "session_id = Depends(cookie_scheme) api_key = Security(api_key_header)" + - "verify_session or verify_api_key" + - "Optional[str] = Security multiple auth" + + with_oauth: + queries: + - "OAuth2PasswordBearer APIKeyHeader" + - "token or api_key" + - "bearer_token api_key dual authentication" + + rate_limiting: + queries: + - "rate_limit = Depends(RateLimiter" + - "api_key rate_limit" + - "slowapi api_key" + +use_cases: + personal_access_tokens: + queries: + - "PAT personal access token" + - "github_token gitlab_token" + - 
"user-generated API key" + + service_accounts: + queries: + - "service_account api_key" + - "machine-to-machine authentication" + - "automated_access" + + webhooks: + queries: + - "webhook_secret api_key" + - "callback authentication" + - "verify_webhook_signature" + + third_party_integrations: + queries: + - "integration_key partner_key" + - "external_service api_key" + - "third_party authentication" diff --git a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/opaque_token_authentication.yaml b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/opaque_token_authentication.yaml new file mode 100644 index 0000000..ed688b6 --- /dev/null +++ b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/opaque_token_authentication.yaml @@ -0,0 +1,320 @@ +pattern: + name: "Opaque Token-Based Authentication (Session)" + id: "01_01_004" + description: "A subject is authenticated based on a unique, opaque token. 
The token is meaningless on its own; the system maintains a mapping of tokens to principals" + +dependencies: + - fastapi + - redis[asyncio] + - cachetools # alternative to redis + - secrets # built-in + +roles: + enforcer: + description: "Ensures the token provided by the Subject is verified before the requested action is processed" + queries: + - query: "APIKeyCookie name session_id" + description: "Cookie-based session enforcement" + priority: high + + - query: "APIKeyHeader X-Session-Token" + description: "Header-based session enforcement" + priority: high + + - query: "cookie_scheme = APIKeyCookie" + description: "Cookie security scheme initialization" + priority: high + + - query: "session_id: str = Depends(cookie_scheme)" + description: "Session ID dependency injection" + priority: high + + verifier: + description: "Verifies whether the received token corresponds to a known principal" + queries: + - query: "async def verify_session session_manager" + description: "Session verification functions" + priority: high + + - query: "session_id HTTPException 401" + description: "Session validation with error handling" + priority: high + + - query: "def get_current_user session_id = Depends" + description: "Current user extraction from session" + priority: high + + - query: "session_manager.get_principal(session_id)" + description: "Principal retrieval from session" + priority: high + + - query: "if principal is None raise HTTPException" + description: "Invalid session error handling" + priority: medium + + principal_provider_redis: + description: "Keeps track of valid tokens and principals using Redis" + queries: + - query: "redis.setex session: principal" + description: "Redis session storage" + priority: high + + - query: "await redis.get session:" + description: "Async Redis session retrieval" + priority: high + + - query: "redis.from_url redis://" + description: "Redis connection setup" + priority: high + + - query: "redis_client.setex(f\"session:{token}\"" + 
description: "Session storage with f-string key" + priority: high + + - query: "await redis.delete session:" + description: "Session deletion" + priority: medium + + principal_provider_memory: + description: "Keeps track of valid tokens and principals using in-memory cache" + queries: + - query: "TTLCache maxsize ttl" + description: "In-memory session cache" + priority: medium + + - query: "from cachetools import TTLCache" + description: "TTLCache import" + priority: medium + + - query: "sessions = TTLCache" + description: "Session cache initialization" + priority: medium + + - query: "sessions[token] = principal" + description: "In-memory session storage" + priority: low + + token_generator: + description: "Generates a new token when requested" + queries: + - query: "secrets.token_urlsafe secrets.token_hex" + description: "Secure session token generation" + priority: high + + - query: "session_id = secrets.token_urlsafe(32)" + description: "Session ID generation (32 bytes)" + priority: high + + - query: "token = secrets.token_hex" + description: "Hex token generation" + priority: medium + + - query: "secrets.token_bytes random" + description: "Random token generation" + priority: low + + registrar: + description: "Handles the registration of tokens for principals" + queries: + - query: "@app.post /login response.set_cookie session_id" + description: "Login with session cookie creation" + priority: high + + - query: "redis.setex httponly secure" + description: "Session registration with secure cookies" + priority: high + + - query: "async def login response: Response session_manager" + description: "Login endpoint with session manager" + priority: high + + - query: "session_id = await session_manager.create_session" + description: "Session creation call" + priority: high + + - query: "response.set_cookie httponly=True secure=True" + description: "Secure cookie attributes" + priority: medium + + session_manager: + description: "Manages the lifecycle of sessions 
including creation, validation, and invalidation" + queries: + - query: "class SessionManager create_session" + description: "Session management class" + priority: high + + - query: "async def get_principal async def invalidate_session" + description: "Session lifecycle methods" + priority: high + + - query: "session_timeout absolute_timeout" + description: "Session timeout configuration" + priority: high + + - query: "def create_session(self, principal: str) -> str" + description: "Session creation method signature" + priority: high + + - query: "last_activity datetime.utcnow" + description: "Activity-based timeout tracking" + priority: medium + + - query: "invalidate_all_sessions principal" + description: "Multiple session invalidation" + priority: low + +complete_implementation: + description: "Queries to find files with complete session authentication implementation" + queries: + - query: "APIKeyCookie secrets.token redis.setex response.set_cookie lang:python" + description: "Complete session authentication pattern" + priority: critical + min_matches: 4 + + - query: "@app.post /login session_manager.create_session httponly lang:python -file:test" + description: "Login endpoint with session management" + priority: high + min_matches: 3 + + - query: "class SessionManager redis async def create_session lang:python" + description: "Session manager with Redis backend" + priority: high + min_matches: 3 + +endpoints: + login: + queries: + - "@app.post /login response.set_cookie" + - "async def login response: Response" + - "session_id = secrets.token httponly" + + logout: + queries: + - "@app.post /logout invalidate_session" + - "response.delete_cookie session_id" + - "await session_manager.invalidate_session" + + protected_endpoint: + queries: + - "async def protected_route principal: str = Depends(get_current_user)" + - "session_id = Depends(cookie_scheme)" + +session_features: + timeout_management: + queries: + - "session_timeout = 1800" + - "absolute_timeout = 
43200" + - "SESSION_TIMEOUT timedelta" + - "max_age expires" + + activity_tracking: + queries: + - "last_activity datetime.utcnow" + - "created_at last_activity" + - "update_last_activity" + + secure_cookies: + queries: + - "httponly=True secure=True" + - "samesite=\"lax\" samesite=\"strict\"" + - "set_cookie domain path" + + session_data: + queries: + - "session_data = {\"principal\":" + - "SessionData principal role" + - "json() parse_raw" + +storage_backends: + redis: + queries: + - "import redis.asyncio as redis" + - "await redis.from_url" + - "redis.setex redis.get redis.delete" + - "redis_client = redis.Redis" + + in_memory: + queries: + - "from cachetools import TTLCache" + - "sessions: Dict[str, SessionData]" + - "threading.Lock sessions" + +anti_patterns: + description: "Security anti-patterns to detect" + + predictable_tokens: + query: "session_id = str(uuid.uuid4()) -secrets" + severity: critical + + short_tokens: + query: "secrets.token_urlsafe(8) secrets.token_urlsafe(16)" + severity: high + + insecure_cookies: + query: "set_cookie -httponly -secure" + severity: critical + + no_timeout: + query: "set_cookie -max_age -expires" + severity: high + + weak_random: + query: "random.randint session_id" + severity: critical + + session_fixation: + query: "session_id = request.cookies.get -create_session" + severity: high + +filters: + language: python + file_extension: "\\.py$" + exclude_tests: true + exclude_forks: true + exclude_archived: true + +search_strategy: + steps: + - name: "Find Enforcer" + queries: ["APIKeyCookie", "APIKeyHeader"] + + - name: "Find Token Generator" + queries: ["secrets.token_urlsafe", "secrets.token_hex"] + + - name: "Find Session Storage" + queries: ["redis.setex session:", "TTLCache sessions"] + + - name: "Find Session Manager" + queries: ["class SessionManager", "create_session invalidate_session"] + + - name: "Verify Secure Implementation" + queries: ["httponly=True secure=True", "session_timeout"] + +best_practices: + 
token_generation: + queries: + - "secrets.token_urlsafe(32)" + - "secrets.token_urlsafe(64)" + - "token_bytes(32)" + description: "At least 128 bits (16 bytes) of entropy" + + cookie_security: + queries: + - "httponly=True" + - "secure=True" + - "samesite=\"lax\"" + - "samesite=\"strict\"" + description: "Secure cookie attributes" + + timeout_config: + queries: + - "session_timeout = 1800" + - "absolute_timeout =" + - "max_age =" + description: "Proper timeout configuration" + + session_regeneration: + queries: + - "invalidate_session create_session" + - "new_session_id = secrets" + description: "Session regeneration on privilege change" diff --git a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml new file mode 100644 index 0000000..9295e83 --- /dev/null +++ b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml @@ -0,0 +1,234 @@ +pattern: + name: "Password-Based Authentication" + id: "01_01_002" + description: "A subject proves they own a claimed identity by providing a correct identifier and the corresponding password" + +dependencies: + - fastapi + - passlib + - cryptography + +repo_metadata_file: + - python_Pypi_mutual_dependents_fastapi_passlib_cryptography.jsonl + +roles: + enforcer: + description: "Ensures the requested action is only performed if the Subject is successfully authenticated" + queries: + - query: "Depends OAuth2PasswordRequestForm OAuth2PasswordRequestFormStrict" + description: "Files containing both OAuth2 password authentication classes" + priority: high + + - query: "Depends HTTPBasic HTTPBasicCredentials" + description: "Files implementing HTTP Basic authentication" + priority: medium + + verification_manager: + description: "Responsible to collect inputs necessary to verify a 
Subject's password" + queries: + - query: "CryptContext pwd_context verify" + description: "Files with password verification context" + priority: high + + - query: "def verify_password lang:python" + description: "Python functions that verify passwords" + priority: high + + - query: "authenticate_user username password" + description: "User authentication functions" + priority: high + + comparator: + description: "Compares the hash value of the received password against stored hash" + queries: + - query: "pwd_context.verify bcrypt.checkpw" + description: "Files containing password comparison functions" + priority: high + + - query: "pwd_context.verify(plain_password, hashed_password)" + description: "Direct password verification calls" + priority: high + + - query: "bcrypt.checkpw password.encode" + description: "Bcrypt password checking" + priority: medium + + hasher: + description: "Calculates the hash value for a given input" + queries: + - query: "pwd_context.hash bcrypt.hashpw" + description: "Files with password hashing implementations" + priority: high + + - query: "CryptContext schemes bcrypt" + description: "Files configuring bcrypt hashing" + priority: high + + - query: "pwd_context.hash(password)" + description: "Direct password hashing calls" + priority: high + + - query: "bcrypt.gensalt bcrypt.hashpw" + description: "Bcrypt salt generation and hashing" + priority: medium + + password_store: + description: "Keeps track of hash values corresponding to each registered identity" + queries: + - query: "hashed_password User.query" + description: "Files with user password storage queries" + priority: high + + - query: "class User password sqlalchemy" + description: "SQLAlchemy User models with password fields" + priority: high + + - query: "Column String hashed_password" + description: "Database column for hashed passwords" + priority: high + + - query: "db.query(User).filter username" + description: "User lookup queries" + priority: medium + + pepper_store: 
+ description: "Keeps track of the pepper value(s) used by the system" + queries: + - query: "Fernet pepper_key encrypt" + description: "Files implementing pepper encryption" + priority: medium + + - query: "pepper cipher.encrypt" + description: "Pepper encryption operations" + priority: medium + + encrypter: + description: "Encrypts a given data element using a given cryptographic key" + queries: + - query: "from cryptography.fernet import Fernet" + description: "Files importing Fernet encryption" + priority: medium + + - query: "Fernet.generate_key cipher.encrypt" + description: "Fernet key generation and encryption" + priority: medium + + registrar: + description: "Handles the Subject's registration" + queries: + - query: "@app.post /register pwd_context.hash" + description: "Registration endpoints with password hashing" + priority: high + + - query: "def register_user db.add" + description: "User registration functions" + priority: high + + - query: "@app.post /signup create_user" + description: "Signup endpoints" + priority: medium + + - query: "new_user = User db.commit" + description: "User creation and database commit" + priority: medium + + password_policy: + description: "Contains the rules any password should satisfy" + queries: + - query: "@validator password len" + description: "Pydantic password validators" + priority: high + + - query: "class PasswordPolicy validate" + description: "Password policy validation classes" + priority: medium + + - query: "password len min_length max_length" + description: "Password length validation" + priority: medium + + - query: "raise ValueError password must" + description: "Password validation errors" + priority: low + + srng: + description: "Cryptographically secure random number generator" + queries: + - query: "secrets.token_urlsafe secrets.token_bytes" + description: "Secure random number generation" + priority: high + + - query: "import secrets token" + description: "Secrets module usage for tokens" + priority: 
medium + +complete_implementation: + description: "Queries to find files with complete password authentication implementation" + queries: + - query: "OAuth2PasswordBearer CryptContext pwd_context.hash pwd_context.verify lang:python" + description: "Complete password authentication pattern" + priority: critical + min_matches: 4 + + - query: "@app.post /login authenticate_user pwd_context lang:python -file:test" + description: "Login endpoint with authentication" + priority: high + min_matches: 3 + +endpoints: + login: + queries: + - "@app.post /login /signin /auth" + - "async def login form_data: OAuth2PasswordRequestForm" + + register: + queries: + - "@app.post /register /signup" + - "def register_user username password" + + password_reset: + queries: + - "@app.post /reset-password /forgot-password" + - "reset_token send_email" + +anti_patterns: + description: "Security anti-patterns to detect" + weak_password_storage: + query: "password = User.password -hash -bcrypt -crypt" + severity: critical + + hardcoded_secrets: + query: "SECRET_KEY = \"" + severity: critical + + no_https: + query: "set_cookie -secure -httponly" + severity: high + + weak_random: + query: "random.randint -secrets -uuid4" + severity: high + +filters: + language: python + file_extension: "\\.py$" + exclude_tests: true + exclude_forks: true + exclude_archived: true + +search_strategy: + steps: + - name: "Find Enforcer" + queries: ["OAuth2PasswordBearer", "HTTPBasic"] + + - name: "Find Verification Logic" + queries: ["pwd_context.verify", "authenticate_user"] + + - name: "Find Password Storage" + queries: ["class User", "hashed_password Column"] + + - name: "Find Registration" + queries: ["@app.post /register", "pwd_context.hash db.add"] + + - name: "Verify Complete Implementation" + queries: ["OAuth2PasswordBearer CryptContext pwd_context"] diff --git a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/session_based_access_control.yaml 
b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/session_based_access_control.yaml new file mode 100644 index 0000000..ad7a020 --- /dev/null +++ b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/session_based_access_control.yaml @@ -0,0 +1,366 @@ +pattern: + name: "Session-Based Access Control" + id: "01_01_006" + description: "A subject is authenticated using a session identifier and authorized based on permissions associated with that session. Combines opaque token authentication with authorization" + +dependencies: + - fastapi + - redis[asyncio] + - sqlalchemy + - pydantic + - secrets # built-in + +roles: + authentication_enforcer: + description: "Ensures the session id provided by the Subject is verified before processing" + queries: + - query: "APIKeyCookie session_id Depends" + description: "Session-based authentication enforcement" + priority: high + + - query: "cookie_scheme = APIKeyCookie(name=\"session_id\")" + description: "Cookie scheme for sessions" + priority: high + + - query: "session_id: str = Depends(cookie_scheme)" + description: "Session dependency injection" + priority: high + + verifier: + description: "Verifies whether the received session identifier corresponds to an open session" + queries: + - query: "session_manager.get_session HTTPException" + description: "Session verification with error handling" + priority: high + + - query: "async def verify_session session = await session_manager.get_session" + description: "Session verification function" + priority: high + + - query: "if session is None raise HTTPException 401" + description: "Invalid session handling" + priority: high + + session_manager: + description: "Keeps track of open sessions with principal and permission data" + queries: + - query: "class SessionManager permissions role" + description: "Session manager with authorization data" + priority: high + + - query: "redis.setex session: SessionData.json" + 
description: "Session storage with permissions" + priority: high + + - query: "SessionData principal role permissions" + description: "Session data model with authorization" + priority: high + + - query: "async def create_session get_user_permissions" + description: "Session creation with permission retrieval" + priority: high + + - query: "session_data = SessionData(principal, role, permissions)" + description: "Session data initialization" + priority: medium + + session_id_generator: + description: "Generates a new session id when requested" + queries: + - query: "secrets.token_urlsafe 32" + description: "Session ID generation (32 bytes)" + priority: high + + - query: "token = secrets.token_urlsafe" + description: "Token generation for session" + priority: medium + + authorization_enforcer: + description: "Ensures the requested action is only executed if allowed by the authorization policy" + queries: + - query: "class AuthorizationChecker required_permissions" + description: "Authorization enforcement class" + priority: high + + - query: "def __call__ session_id required_permissions" + description: "Authorization checker callable" + priority: high + + - query: "async def __call__ session: SessionData = Depends" + description: "Authorization dependency with session" + priority: high + + - query: "require_permissions(permissions: List" + description: "Permission requirement factory" + priority: medium + + decider: + description: "Verifies the Subject's access request against the authorization policy" + queries: + - query: "required_permissions.issubset session.permissions" + description: "Permission checking logic" + priority: high + + - query: "HTTPException 403 Missing permissions" + description: "Authorization denial" + priority: high + + - query: "missing_perms = set(required_permissions) - session.permissions" + description: "Missing permission calculation" + priority: high + + - query: "if session.role == Role.ADMIN" + description: "Role-based authorization" 
+ priority: medium + + - query: "for permission in required_permissions" + description: "Permission iteration check" + priority: low + + policy_provider: + description: "Keeps track of the applied authorization policy" + queries: + - query: "class PolicyProvider get_role_permissions" + description: "Policy management class" + priority: high + + - query: "role_permissions user_roles" + description: "Role-permission mapping" + priority: high + + - query: "db.query Role permissions" + description: "Database role queries" + priority: high + + - query: "def get_user_permissions(principal: str)" + description: "User permission retrieval" + priority: high + + - query: "self.role_permissions.get(role, set())" + description: "Permission set retrieval" + priority: medium + + - query: "Enum Permission READ WRITE DELETE" + description: "Permission enumeration" + priority: medium + + registrar: + description: "Allows the Subject to open a new session through authentication" + queries: + - query: "create_session policy_provider.get_user_permissions" + description: "Session creation with permissions" + priority: high + + - query: "@app.post /login session_manager.create_session" + description: "Login endpoint with session creation" + priority: high + + - query: "authenticate_user create_session response.set_cookie" + description: "Authentication followed by session creation" + priority: high + +complete_implementation: + description: "Queries to find files with complete session-based access control implementation" + queries: + - query: "APIKeyCookie required_permissions session.permissions HTTPException 403 lang:python" + description: "Complete session-based access control pattern" + priority: critical + min_matches: 4 + + - query: "class SessionManager class PolicyProvider get_role_permissions lang:python -file:test" + description: "Session manager with policy provider" + priority: high + min_matches: 3 + + - query: "@app.post /login create_session get_user_permissions httponly 
lang:python" + description: "Login with permission-aware session creation" + priority: high + min_matches: 3 + +endpoints: + login_with_authz: + queries: + - "@app.post /login session_manager.create_session policy_provider" + - "role = policy_provider.get_user_role" + - "permissions = policy_provider.get_user_permissions" + + protected_with_permissions: + queries: + - "session: SessionData = Depends(require_permissions" + - "async def protected_route Depends(AuthorizationChecker" + - "session: SessionData = Depends(check_permissions" + + role_based: + queries: + - "if session.role != Role.ADMIN" + - "session: SessionData Role.USER" + + logout: + queries: + - "@app.post /logout invalidate_session" + - "response.delete_cookie session_id" + +authorization_models: + permissions: + queries: + - "class Permission Enum" + - "Permission.READ Permission.WRITE Permission.DELETE" + - "Set[Permission]" + + roles: + queries: + - "class Role Enum" + - "Role.ADMIN Role.USER Role.GUEST" + - "user.role session.role" + + session_data: + queries: + - "class SessionData principal role permissions" + - "SessionData(principal=, role=, permissions=)" + - "session.permissions session.role" + + policy_mapping: + queries: + - "role_permissions: Dict[Role, Set[Permission]]" + - "user_roles: Dict[str, Role]" + - "get_role_permissions get_user_role" + +permission_checking: + subset_check: + queries: + - "required_permissions.issubset(session.permissions)" + - "set(required_permissions) <= session.permissions" + + individual_check: + queries: + - "if permission in session.permissions" + - "for perm in required_permissions" + + role_override: + queries: + - "if session.role == Role.ADMIN: return True" + - "is_admin = session.role == Role.ADMIN" + + missing_permissions: + queries: + - "missing = set(required_permissions) - session.permissions" + - "if missing_perms: raise HTTPException(403" + +advanced_authorization: + attribute_based: + queries: + - "class ABACDecider policy_function" + - "def 
policy_function(session: SessionData, context: Dict" + - "context.get(\"resource_owner\")" + + resource_ownership: + queries: + - "if session.principal == resource.owner" + - "resource_owner == session.principal" + + time_based: + queries: + - "datetime.now() working_hours" + - "if current_time.hour" + + location_based: + queries: + - "request.client.host allowed_ips" + - "session.ip_address" + +factories_and_helpers: + permission_factory: + queries: + - "def require_permissions(permissions: List[Permission])" + - "return AuthorizationChecker(required_permissions=permissions)" + + role_factory: + queries: + - "def require_role(role: Role)" + - "def admin_only()" + + combined_checker: + queries: + - "def require_auth_and_authz" + - "verify_credentials check_permissions" + +anti_patterns: + description: "Security anti-patterns to detect" + + authorization_bypass: + query: "if True: pass # TODO: implement authorization" + severity: critical + + permission_disabled: + query: "ENABLE_AUTHORIZATION = False" + severity: critical + + role_hardcoded: + query: "if username == \"admin\"" + severity: high + + no_permission_check: + query: "@app.delete -required_permissions -Depends" + severity: high + + session_without_permissions: + query: "class SessionData principal -permissions -role" + severity: medium + +filters: + language: python + file_extension: "\\.py$" + exclude_tests: true + exclude_forks: true + exclude_archived: true + +search_strategy: + steps: + - name: "Find Authentication Enforcer" + queries: ["APIKeyCookie session_id"] + + - name: "Find Authorization Enforcer" + queries: ["class AuthorizationChecker", "require_permissions"] + + - name: "Find Session Manager" + queries: ["class SessionManager permissions role"] + + - name: "Find Policy Provider" + queries: ["class PolicyProvider", "get_role_permissions"] + + - name: "Find Permission Checking" + queries: ["required_permissions.issubset", "HTTPException 403"] + + - name: "Verify Complete Implementation" + 
queries: ["SessionData permissions HTTPException 403"] + +integration_patterns: + session_with_permissions: + queries: + - "session_data = SessionData(principal=username, role=role, permissions=perms)" + - "get_user_permissions principal" + + login_flow: + queries: + - "authenticate_user -> get_user_role -> get_user_permissions -> create_session" + - "@app.post /login authenticate_user policy_provider" + + protected_endpoint_flow: + queries: + - "verify_session -> check_permissions -> execute_action" + - "Depends(verify) Depends(authorize)" + +rbac_vs_abac: + rbac: + description: "Role-Based Access Control patterns" + queries: + - "role_permissions: Dict[Role, Set[Permission]]" + - "if session.role == Role.ADMIN" + - "get_role_permissions(role)" + + abac: + description: "Attribute-Based Access Control patterns" + queries: + - "policy_function(session: SessionData, context: Dict[str, Any])" + - "subject_attributes resource_attributes action_attributes" + - "evaluate_policy(subject, resource, action, environment)" diff --git a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/verifiable_token_authentication.yaml b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/verifiable_token_authentication.yaml new file mode 100644 index 0000000..efa9a69 --- /dev/null +++ b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/verifiable_token_authentication.yaml @@ -0,0 +1,312 @@ +pattern: + name: "Verifiable Token-Based Authentication (JWT)" + id: "01_01_003" + description: "A subject is identified based on a token that contains sufficient information to determine the correct principal, verified through digital signature or MAC" + +dependencies: + - fastapi + - python-jose[cryptography] + - PyJWT + - cryptography + - redis # optional, for token revocation + +roles: + enforcer: + description: "Ensures the requested action is only performed if the Subject is successfully 
authenticated" + queries: + - query: "HTTPBearer OAuth2PasswordBearer" + description: "Bearer token enforcement" + priority: high + + - query: "HTTPAuthorizationCredentials Depends" + description: "HTTP authorization credentials dependency" + priority: high + + - query: "security = HTTPBearer()" + description: "HTTPBearer security scheme initialization" + priority: high + + - query: "credentials: HTTPAuthorizationCredentials = Depends(security)" + description: "Bearer token dependency injection" + priority: high + + verifier: + description: "Manages the verification of whether a token is valid" + queries: + - query: "jwt.decode SECRET_KEY algorithms" + description: "JWT token verification" + priority: high + + - query: "def verify_token jwt.decode" + description: "Token verification functions" + priority: high + + - query: "jwt.decode(token, SECRET_KEY, algorithms" + description: "Direct JWT decode calls" + priority: high + + - query: "payload = jwt.decode JWTError" + description: "JWT decode with error handling" + priority: high + + - query: "credentials.credentials jwt.decode" + description: "Extracting and verifying JWT from credentials" + priority: medium + + cryptographer_mac: + description: "Provides cryptographic primitives for MAC-based tokens (HMAC)" + queries: + - query: "jwt.encode HS256 SECRET_KEY" + description: "JWT encoding with HMAC" + priority: high + + - query: "from jose import jwt" + description: "Python-jose JWT import" + priority: high + + - query: "algorithm = \"HS256\" jwt.encode" + description: "HMAC algorithm configuration" + priority: high + + - query: "jwt.encode jwt.decode HS256" + description: "Complete HMAC JWT operations" + priority: medium + + cryptographer_signature: + description: "Provides cryptographic primitives for digitally signed tokens (RSA)" + queries: + - query: "jwt.encode RS256 private_key" + description: "JWT encoding with RSA signature" + priority: high + + - query: "jwt.decode public_key RS256" + description: "JWT 
verification with public key" + priority: high + + - query: "algorithm = \"RS256\" jwt" + description: "RSA signature algorithm configuration" + priority: high + + - query: "RS256 RS384 RS512 jwt" + description: "RSA algorithm variants" + priority: medium + + key_manager_hmac: + description: "Manages cryptographic keys for HMAC tokens" + queries: + - query: "SECRET_KEY = os.getenv" + description: "Secret key from environment" + priority: high + + - query: "SECRET_KEY ALGORITHM = \"HS256\"" + description: "HMAC key and algorithm configuration" + priority: high + + - query: "load_dotenv SECRET_KEY" + description: "Loading secret keys from environment" + priority: medium + + key_manager_rsa: + description: "Manages cryptographic keys for RSA signed tokens" + queries: + - query: "rsa.generate_private_key public_exponent" + description: "RSA key generation" + priority: high + + - query: "from cryptography.hazmat.primitives.asymmetric import rsa" + description: "RSA key management imports" + priority: high + + - query: "private_key.public_key() serialization" + description: "Public key extraction and serialization" + priority: high + + - query: "serialization.PrivateFormat.PKCS8" + description: "Private key serialization" + priority: medium + + - query: "serialization.PublicFormat.SubjectPublicKeyInfo" + description: "Public key serialization" + priority: medium + + token_generator: + description: "Manages the generation of new tokens" + queries: + - query: "def create_access_token jwt.encode" + description: "Access token creation functions" + priority: high + + - query: "jwt.encode exp sub" + description: "JWT encoding with expiration and subject" + priority: high + + - query: "create_access_token data: dict expires_delta" + description: "Token creation with expiration parameter" + priority: high + + - query: "timedelta(minutes jwt.encode" + description: "Token expiration time calculation" + priority: medium + + - query: "datetime.utcnow() + expires_delta" + 
description: "Expiration timestamp calculation" + priority: low + + registrar: + description: "Provides the Subject a token after successful authentication" + queries: + - query: "@app.post /token create_access_token" + description: "Token issuance endpoints" + priority: high + + - query: "return access_token token_type bearer" + description: "Token response formatting" + priority: high + + - query: "@app.post /login jwt.encode" + description: "Login endpoint with JWT generation" + priority: high + + - query: "authenticate_user create_access_token" + description: "Authentication followed by token creation" + priority: medium + + token_blacklist: + description: "Tracks revoked but not yet expired tokens (optional feature)" + queries: + - query: "redis.sadd revoked_tokens jwt" + description: "Token revocation with Redis" + priority: medium + + - query: "blacklist.revoke token" + description: "Token blacklist management" + priority: medium + + - query: "class TokenBlacklist revoked_tokens" + description: "Token blacklist class" + priority: medium + + - query: "is_revoked token HTTPException" + description: "Revocation checking" + priority: low + +complete_implementation: + description: "Queries to find files with complete JWT authentication implementation" + queries: + - query: "HTTPBearer jwt.encode jwt.decode SECRET_KEY lang:python" + description: "Complete JWT authentication pattern" + priority: critical + min_matches: 4 + + - query: "create_access_token jwt.encode exp sub lang:python -file:test" + description: "Token generation with proper claims" + priority: high + min_matches: 3 + + - query: "@app.post /token authenticate_user jwt.encode lang:python" + description: "Token endpoint with authentication" + priority: high + min_matches: 3 + +endpoints: + token_issuance: + queries: + - "@app.post /token" + - "async def login_for_access_token" + - "return {\"access_token\": access_token, \"token_type\": \"bearer\"}" + + token_refresh: + queries: + - "@app.post 
/refresh /token/refresh" + - "def refresh_token verify_token" + + protected_endpoint: + queries: + - "async def read_users_me token: str = Depends" + - "current_user = Depends(get_current_user)" + +token_features: + expiration: + queries: + - "exp datetime.utcnow" + - "expires_delta timedelta" + - "ACCESS_TOKEN_EXPIRE_MINUTES" + + claims: + queries: + - "sub username" + - "payload.get(\"sub\")" + - "to_encode.update exp iat" + + algorithms: + hmac: + queries: + - "HS256" + - "HS384" + - "HS512" + + rsa: + queries: + - "RS256" + - "RS384" + - "RS512" + +anti_patterns: + description: "Security anti-patterns to detect" + + weak_algorithm: + query: "algorithm = \"none\" jwt.encode" + severity: critical + + missing_expiration: + query: "jwt.encode -exp -expires" + severity: high + + hardcoded_secret: + query: "SECRET_KEY = \"my-secret-key\"" + severity: critical + + weak_secret: + query: "SECRET_KEY = \"secret\" \"test\"" + severity: critical + + no_algorithm_verification: + query: "jwt.decode -algorithms verify_signature=False" + severity: critical + +filters: + language: python + file_extension: "\\.py$" + exclude_tests: true + exclude_forks: true + exclude_archived: true + +search_strategy: + steps: + - name: "Find Enforcer" + queries: ["HTTPBearer", "OAuth2PasswordBearer"] + + - name: "Find Token Generator" + queries: ["create_access_token", "jwt.encode"] + + - name: "Find Verifier" + queries: ["jwt.decode", "verify_token"] + + - name: "Find Key Management" + queries: ["SECRET_KEY", "rsa.generate_private_key"] + + - name: "Verify Complete Implementation" + queries: ["HTTPBearer jwt.encode jwt.decode"] + +library_alternatives: + jwt_libraries: + - name: python-jose + queries: + - "from jose import jwt" + - "from jose import JWTError" + + - name: PyJWT + queries: + - "import jwt" + - "import jwt as PyJWT" + - "from jwt import encode, decode" diff --git a/security_pattern_miner/src/context_retriever/queries_loader.py 
b/security_pattern_miner/src/context_retriever/queries_loader.py new file mode 100644 index 0000000..a67a690 --- /dev/null +++ b/security_pattern_miner/src/context_retriever/queries_loader.py @@ -0,0 +1,61 @@ +import yaml +import jsonlines +from ..config.constants import GITHUB, FASTAPI +from ..config.queries_loader import QueriesLoaderConfig +from typing import List +from pydantic import BaseModel +import os + +class Query(BaseModel): + repo: str + role: str + query: str + webframework: str + pattern: str + +class QueriesLoader: + def __init__(self, language: str, + package_manager: str, + web_framework: str = FASTAPI, + pattern: str = "", + config: QueriesLoaderConfig = QueriesLoaderConfig + ): + self.language = language + self.package_manager = package_manager + self.web_framework = web_framework + self.yaml_path_postfix = f"{language}/{web_framework}/patterns/{pattern}.yaml" + self.pattern = pattern + self.queries: List[Query] = [] + self.config = config + + def load_from_pattern_metadata_file(self, file_path: str): + with open(file_path, 'r') as file: + self.metadata = yaml.safe_load(file) + + def load_roles(self) -> List[str]: + return self.metadata.get("roles", []) + + def load_repo_names(self, repo_meta_data_file_path: str) -> List[str]: + self.repo_names = [] + with jsonlines.open(repo_meta_data_file_path, "r") as repo_data: + for item in repo_data: + self.repo_names.append(f"{GITHUB}.com/{item.get('full_name')}") + return self.repo_names + + def process_query(self, query: str, repo) -> str: + return " ".join([query, f"lang:{self.language}", f"r:{repo}"]) + + def load_queries(self) -> List[Query]: + self.queries: List[Query] = [] + repo_meta_data_file = self.metadata.get("repo_metadata_file", [])[0] + repo_meta_data_file_path = os.path.join(self.config.repos_name_dir, repo_meta_data_file) + for repo in self.load_repo_names(repo_meta_data_file_path= repo_meta_data_file_path): + for role in self.load_roles(): + for query in role.get("queries", []): + 
self.queries.append(Query(repo=repo, role=role, query=self.process_query(query, repo), webframework=self.web_framework, pattern=self.pattern)) + return self.queries + + def save_queries_to_file(self, output_file_path: str): + with jsonlines.open(output_file_path, "w") as writer: + for query in self.queries: + writer.write(query.dict()) \ No newline at end of file diff --git a/security_pattern_miner/src/context_retriever/zoekt_retriever.py b/security_pattern_miner/src/context_retriever/zoekt_retriever.py new file mode 100644 index 0000000..e69de29 diff --git a/security_pattern_miner/src/runner.py b/security_pattern_miner/src/runner.py index 8d3dd79..7c2fccc 100644 --- a/security_pattern_miner/src/runner.py +++ b/security_pattern_miner/src/runner.py @@ -8,7 +8,8 @@ from config.constants import PYTHON, PYPI, JAVA, MAVEN from config.crawler import GitCrawlerConfig from config.libraries_io import LibrariesIOConfig - +from config.queries_loader import QueriesLoaderConfig +from context_retriever.queries_loader import QueriesLoader dependent_miners = { (PYTHON, PYPI): PythonDependentMiner, @@ -36,6 +37,16 @@ def __init__(self, args): LibrariesIOConfig.dependent_repo_info_save_dir = os.path.join(args.root_data_dir, "dependent_repos_info") GitCrawlerConfig.root_data_dir = args.root_data_dir GitCrawlerConfig.cloned_repos_dir = os.path.join(args.root_data_dir, "cloned_repos") + QueriesLoaderConfig.root_data_dir = args.root_data_dir + QueriesLoaderConfig.repos_name_dir = os.path.join(args.root_data_dir, "dependent_repos_info") + QueriesLoaderConfig.output_queries_dir = os.path.join(args.root_data_dir, "output_queries") + if args.construct_queries and args.pattern: + self.query_constructor = QueriesLoader( + language=args.language.lower(), + package_manager=args.package_manager, + pattern=args.pattern, + config=QueriesLoaderConfig + ) self.dependent_miner = dependent_miners.get((args.language.lower(), args.package_manager), None)(LibrariesIOConfig) self.repo_crawler = 
GitCrawler(GitCrawlerConfig) @@ -73,7 +84,14 @@ def run(self, package_names: list[str]): # Step 4: Crawl and clone the dependent repositories self.repo_crawler.crawl_from_dependent_repos_info(dependent_repos) logger.info("Completed crawling and cloning dependent repositories") - + + if self.args.construct_queries: + self.query_constructor.load_from_pattern_metadata_file( + file_path=os.path.join('./context/retriever/queries_library', self.query_constructor.yaml_path_postfix) + ) + queries = self.query_constructor.load_queries() + output_file_path = os.path.join(QueriesLoaderConfig.output_queries_dir, f"{self.args.language}_{self.args.package_manager}_mutual_dependents.jsonl") + self.query_constructor.save_queries_to_file() if __name__ == "__main__": import argparse @@ -94,6 +112,7 @@ def run(self, package_names: list[str]): parser.add_argument("--crawl_only", action="store_true", help="Flag to only crawl repositories from previously saved dependent info") parser.add_argument("--start_index", type=int, default=0, help="Start index for crawling repositories") parser.add_argument("--end_index", type=int, default=-1, help="End index for crawling repositories") + args = parser.parse_args() diff --git a/security_pattern_miner/tests/authentication.py b/security_pattern_miner/tests/authentication.py new file mode 100644 index 0000000..22a02c4 --- /dev/null +++ b/security_pattern_miner/tests/authentication.py @@ -0,0 +1 @@ +from fastapi.security import OAuth2PasswordBearer From 3567dec1da144e4b742aa6b8a0f03f951e737d4b Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Sat, 8 Nov 2025 21:37:23 +0000 Subject: [PATCH 11/26] tmp main file --- security_pattern_miner/src/runner.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/security_pattern_miner/src/runner.py b/security_pattern_miner/src/runner.py index 7c2fccc..eaa63bc 100644 --- a/security_pattern_miner/src/runner.py +++ b/security_pattern_miner/src/runner.py @@ -40,6 +40,11 @@ def 
__init__(self, args): QueriesLoaderConfig.root_data_dir = args.root_data_dir QueriesLoaderConfig.repos_name_dir = os.path.join(args.root_data_dir, "dependent_repos_info") QueriesLoaderConfig.output_queries_dir = os.path.join(args.root_data_dir, "output_queries") + + + self.dependent_miner = dependent_miners.get((args.language.lower(), args.package_manager), None)(LibrariesIOConfig) + self.repo_crawler = GitCrawler(GitCrawlerConfig) + if args.construct_queries and args.pattern: self.query_constructor = QueriesLoader( language=args.language.lower(), @@ -48,9 +53,6 @@ def __init__(self, args): config=QueriesLoaderConfig ) - self.dependent_miner = dependent_miners.get((args.language.lower(), args.package_manager), None)(LibrariesIOConfig) - self.repo_crawler = GitCrawler(GitCrawlerConfig) - def run(self, package_names: list[str]): if self.args.get_dependents: # Step 0 Get each package's dependents and save to files @@ -90,8 +92,9 @@ def run(self, package_names: list[str]): file_path=os.path.join('./context/retriever/queries_library', self.query_constructor.yaml_path_postfix) ) queries = self.query_constructor.load_queries() - output_file_path = os.path.join(QueriesLoaderConfig.output_queries_dir, f"{self.args.language}_{self.args.package_manager}_mutual_dependents.jsonl") - self.query_constructor.save_queries_to_file() + print(queries) + output_file_path = os.path.join(QueriesLoaderConfig.output_queries_dir, f"{self.args.pattern}_{self.args.web_framework}_queries.jsonl") + self.query_constructor.save_queries_to_file(output_file_path) if __name__ == "__main__": import argparse From 5df53c73b8416a3ab71a80dfe8c0961713bcd22e Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Sat, 8 Nov 2025 21:39:37 +0000 Subject: [PATCH 12/26] run query constructor --- security_pattern_miner/src/runner.py | 34 ++++++++++++++++------------ 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/security_pattern_miner/src/runner.py b/security_pattern_miner/src/runner.py index 
eaa63bc..e5df877 100644 --- a/security_pattern_miner/src/runner.py +++ b/security_pattern_miner/src/runner.py @@ -100,29 +100,33 @@ def run(self, package_names: list[str]): import argparse parser = argparse.ArgumentParser(description="Run the dependent miner and repo crawler") - # Libaries.io related arguments - parser.add_argument("--get_dependents", action="store_true", help="Flag to get dependents for the specified package names (from scratch)") + # # Libaries.io related arguments + # parser.add_argument("--get_dependents", action="store_true", help="Flag to get dependents for the specified package names (from scratch)") parser.add_argument("--language", type=str, default=PYTHON, help="Programming language") - parser.add_argument("--package_manager", type=str, default=PYPI, help="Package manager") - parser.add_argument("--package_names", type=str, nargs='+', required=True, help="List of package names to find mutual dependents") - parser.add_argument("--max_pages", type=int, default=2000, help="Maximum number of pages to fetch from Libraries.io") - parser.add_argument("--per_page", type=int, default=100, help="Number of results per page from Libraries.io") - parser.add_argument("--start_page", type=int, default=1, help="Starting page number for fetching dependents") + # parser.add_argument("--package_manager", type=str, default=PYPI, help="Package manager") + # parser.add_argument("--package_names", type=str, nargs='+', required=True, help="List of package names to find mutual dependents") + # parser.add_argument("--max_pages", type=int, default=2000, help="Maximum number of pages to fetch from Libraries.io") + # parser.add_argument("--per_page", type=int, default=100, help="Number of results per page from Libraries.io") + # parser.add_argument("--start_page", type=int, default=1, help="Starting page number for fetching dependents") parser.add_argument("--root_data_dir", type=str, default="/data", help="Directory to save dependent repository info") - 
parser.add_argument("--clean_only", action="store_true", help="Flag to only clean previously saved dependent info files and exit") - # Git crawler related arguments - parser.add_argument("--crawl_only", action="store_true", help="Flag to only crawl repositories from previously saved dependent info") - parser.add_argument("--start_index", type=int, default=0, help="Start index for crawling repositories") - parser.add_argument("--end_index", type=int, default=-1, help="End index for crawling repositories") + # parser.add_argument("--clean_only", action="store_true", help="Flag to only clean previously saved dependent info files and exit") + # # Git crawler related arguments + # parser.add_argument("--crawl_only", action="store_true", help="Flag to only crawl repositories from previously saved dependent info") + # parser.add_argument("--start_index", type=int, default=0, help="Start index for crawling repositories") + # parser.add_argument("--end_index", type=int, default=-1, help="End index for crawling repositories") + parser.add_argument("--construct_queries", action="store_true", help="Flag to construct queries based on the specified pattern") + parser.add_argument("--pattern", type=str, default="", help="Security pattern name for query construction") + parser.add_argument("--web_framework", type=str, default="fastapi", help="Web framework name for query construction") + # parser.add_argument("--root_data_dir", type=str, default="/ args = parser.parse_args() - pipeline = Pipeline(args) - # print(args.package_names) - pipeline.run(args.package_names) + # pipeline = Pipeline(args) + # # print(args.package_names) + # pipeline.run(args.package_names) # python_dependent_miner.get_dependents("flask") # python_dependent_miner.clean_saved_dependents("flask") From 0a485e736f085ed6c05e7981caae9a2c58f28d51 Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Sat, 8 Nov 2025 22:32:36 +0000 Subject: [PATCH 13/26] run query constructor --- security_pattern_miner/requirements.txt | 3 
+- .../password_based_authentication.yaml | 44 +------------- .../src/context_retriever/queries_loader.py | 14 ++--- security_pattern_miner/src/runner.py | 58 ++++++++++--------- 4 files changed, 41 insertions(+), 78 deletions(-) diff --git a/security_pattern_miner/requirements.txt b/security_pattern_miner/requirements.txt index 8e25d61..e6a6e41 100644 --- a/security_pattern_miner/requirements.txt +++ b/security_pattern_miner/requirements.txt @@ -3,4 +3,5 @@ requests # python-woc jsonlines tqdm -GitPython \ No newline at end of file +GitPython +pyYAML \ No newline at end of file diff --git a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml index 9295e83..707c5d0 100644 --- a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml +++ b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml @@ -9,7 +9,7 @@ dependencies: - cryptography repo_metadata_file: - - python_Pypi_mutual_dependents_fastapi_passlib_cryptography.jsonl + - python_Pypi_mutual_dependents_fastapi_passlib.jsonl roles: enforcer: @@ -190,45 +190,3 @@ endpoints: queries: - "@app.post /reset-password /forgot-password" - "reset_token send_email" - -anti_patterns: - description: "Security anti-patterns to detect" - weak_password_storage: - query: "password = User.password -hash -bcrypt -crypt" - severity: critical - - hardcoded_secrets: - query: "SECRET_KEY = \"" - severity: critical - - no_https: - query: "set_cookie -secure -httponly" - severity: high - - weak_random: - query: "random.randint -secrets -uuid4" - severity: high - -filters: - language: python - file_extension: "\\.py$" - exclude_tests: true - exclude_forks: true - exclude_archived: true - -search_strategy: - steps: - - name: 
"Find Enforcer" - queries: ["OAuth2PasswordBearer", "HTTPBasic"] - - - name: "Find Verification Logic" - queries: ["pwd_context.verify", "authenticate_user"] - - - name: "Find Password Storage" - queries: ["class User", "hashed_password Column"] - - - name: "Find Registration" - queries: ["@app.post /register", "pwd_context.hash db.add"] - - - name: "Verify Complete Implementation" - queries: ["OAuth2PasswordBearer CryptContext pwd_context"] diff --git a/security_pattern_miner/src/context_retriever/queries_loader.py b/security_pattern_miner/src/context_retriever/queries_loader.py index a67a690..83e6922 100644 --- a/security_pattern_miner/src/context_retriever/queries_loader.py +++ b/security_pattern_miner/src/context_retriever/queries_loader.py @@ -1,7 +1,8 @@ import yaml import jsonlines -from ..config.constants import GITHUB, FASTAPI -from ..config.queries_loader import QueriesLoaderConfig +import sys; sys.path.append("..") +from config.constants import GITHUB, FASTAPI +from config.queries_loader import QueriesLoaderConfig from typing import List from pydantic import BaseModel import os @@ -15,13 +16,11 @@ class Query(BaseModel): class QueriesLoader: def __init__(self, language: str, - package_manager: str, web_framework: str = FASTAPI, pattern: str = "", config: QueriesLoaderConfig = QueriesLoaderConfig ): self.language = language - self.package_manager = package_manager self.web_framework = web_framework self.yaml_path_postfix = f"{language}/{web_framework}/patterns/{pattern}.yaml" self.pattern = pattern @@ -40,9 +39,10 @@ def load_repo_names(self, repo_meta_data_file_path: str) -> List[str]: with jsonlines.open(repo_meta_data_file_path, "r") as repo_data: for item in repo_data: self.repo_names.append(f"{GITHUB}.com/{item.get('full_name')}") + # print(self.repo_names) return self.repo_names - def process_query(self, query: str, repo) -> str: + def process_query(self, query: str, repo : str) -> str: return " ".join([query, f"lang:{self.language}", f"r:{repo}"]) 
def load_queries(self) -> List[Query]: @@ -51,8 +51,8 @@ def load_queries(self) -> List[Query]: repo_meta_data_file_path = os.path.join(self.config.repos_name_dir, repo_meta_data_file) for repo in self.load_repo_names(repo_meta_data_file_path= repo_meta_data_file_path): for role in self.load_roles(): - for query in role.get("queries", []): - self.queries.append(Query(repo=repo, role=role, query=self.process_query(query, repo), webframework=self.web_framework, pattern=self.pattern)) + for query in self.metadata["roles"][role].get("queries", []): + self.queries.append(Query(repo=repo, role=role, query=self.process_query(query['query'], repo), webframework=self.web_framework, pattern=self.pattern)) return self.queries def save_queries_to_file(self, output_file_path: str): diff --git a/security_pattern_miner/src/runner.py b/security_pattern_miner/src/runner.py index e5df877..12f1853 100644 --- a/security_pattern_miner/src/runner.py +++ b/security_pattern_miner/src/runner.py @@ -22,33 +22,33 @@ def __init__(self, args): # raise ValueError("Currently, only Python language with PyPI package manager is supported.") self.args = args - if args.max_pages: - LibrariesIOConfig.max_num_pages = args.max_pages - if args.per_page: - LibrariesIOConfig.max_per_page = args.per_page - if args.start_page: - LibrariesIOConfig.start_page = args.start_page - if args.start_index is not None: - GitCrawlerConfig.start_index = args.start_index - if args.end_index is not None: - GitCrawlerConfig.end_index = args.end_index - if args.root_data_dir: - LibrariesIOConfig.root_data_dir = args.root_data_dir - LibrariesIOConfig.dependent_repo_info_save_dir = os.path.join(args.root_data_dir, "dependent_repos_info") - GitCrawlerConfig.root_data_dir = args.root_data_dir - GitCrawlerConfig.cloned_repos_dir = os.path.join(args.root_data_dir, "cloned_repos") - QueriesLoaderConfig.root_data_dir = args.root_data_dir - QueriesLoaderConfig.repos_name_dir = os.path.join(args.root_data_dir, "dependent_repos_info") 
- QueriesLoaderConfig.output_queries_dir = os.path.join(args.root_data_dir, "output_queries") - - - self.dependent_miner = dependent_miners.get((args.language.lower(), args.package_manager), None)(LibrariesIOConfig) - self.repo_crawler = GitCrawler(GitCrawlerConfig) + if not args.construct_queries: + if args.max_pages: + LibrariesIOConfig.max_num_pages = args.max_pages + if args.per_page: + LibrariesIOConfig.max_per_page = args.per_page + if args.start_page: + LibrariesIOConfig.start_page = args.start_page + if args.start_index is not None: + GitCrawlerConfig.start_index = args.start_index + if args.end_index is not None: + GitCrawlerConfig.end_index = args.end_index + if args.root_data_dir: + LibrariesIOConfig.root_data_dir = args.root_data_dir + LibrariesIOConfig.dependent_repo_info_save_dir = os.path.join(args.root_data_dir, "dependent_repos_info") + GitCrawlerConfig.root_data_dir = args.root_data_dir + GitCrawlerConfig.cloned_repos_dir = os.path.join(args.root_data_dir, "cloned_repos") + QueriesLoaderConfig.root_data_dir = args.root_data_dir + QueriesLoaderConfig.repos_name_dir = os.path.join(args.root_data_dir, "dependent_repos_info") + QueriesLoaderConfig.output_queries_dir = os.path.join(args.root_data_dir, "output_queries") + + + self.dependent_miner = dependent_miners.get((args.language.lower(), args.package_manager), None)(LibrariesIOConfig) + self.repo_crawler = GitCrawler(GitCrawlerConfig) if args.construct_queries and args.pattern: self.query_constructor = QueriesLoader( language=args.language.lower(), - package_manager=args.package_manager, pattern=args.pattern, config=QueriesLoaderConfig ) @@ -86,10 +86,12 @@ def run(self, package_names: list[str]): # Step 4: Crawl and clone the dependent repositories self.repo_crawler.crawl_from_dependent_repos_info(dependent_repos) logger.info("Completed crawling and cloning dependent repositories") - + + + def construct_queries(self): if self.args.construct_queries: 
self.query_constructor.load_from_pattern_metadata_file( - file_path=os.path.join('./context/retriever/queries_library', self.query_constructor.yaml_path_postfix) + file_path=os.path.join('./context_retriever/queries_library', self.query_constructor.yaml_path_postfix) ) queries = self.query_constructor.load_queries() print(queries) @@ -124,7 +126,8 @@ def run(self, package_names: list[str]): - # pipeline = Pipeline(args) + pipeline = Pipeline(args) + pipeline.construct_queries() # # print(args.package_names) # pipeline.run(args.package_names) # python_dependent_miner.get_dependents("flask") @@ -150,4 +153,5 @@ def run(self, package_names: list[str]): # from repo_crawler.base import GitCrawler # git_crawler = GitCrawler() # dependent_repos = git_crawler.load_dependedent_repos_info(saved_jsonl_path) - # git_crawler.crawl_from_dependent_repos_info(dependent_repos) \ No newline at end of file + # git_crawler.crawl_from_dependent_repos_info(dependent_repos) + \ No newline at end of file From f22ce9d112e94ff34d7d22840538c27958933ed0 Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Sat, 8 Nov 2025 22:41:53 +0000 Subject: [PATCH 14/26] fix docker compose --- docker-compose.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index addb2a5..329f805 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,6 +18,25 @@ services: cpus: '4.0' memory: 4G + security_pattern_extractor: + build: + context: ./security_pattern_miner + dockerfile: securityPatternMiner.dockerfile + image: security_pattern_miner:latest + volumes: + - ./build/volumes/data:/data + env_file: + - ./.env + command: ["python", "./runner.py", "--construct_queries", "--pattern", "password_based_authentication", "--web_framework", "fastapi", "--language", "python", "--root_data_dir=/data"] + deploy: + resources: + limits: + cpus: '8.0' + memory: 10G + reservations: + cpus: '4.0' + memory: 4G + zoekt-webserver: image: zoekt-local ports: From 
d173deab9cb82492ada2311f3a7105fcb52914a1 Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Sat, 8 Nov 2025 22:59:13 +0000 Subject: [PATCH 15/26] refactor: update docker-compose image definition and improve runner.py structure --- docker-compose.yml | 2 +- security_pattern_miner/src/runner.py | 201 +++++++++++++++------------ 2 files changed, 112 insertions(+), 91 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 329f805..ede8b09 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -22,7 +22,7 @@ services: build: context: ./security_pattern_miner dockerfile: securityPatternMiner.dockerfile - image: security_pattern_miner:latest + image: security_pattern_miner:latest volumes: - ./build/volumes/data:/data env_file: diff --git a/security_pattern_miner/src/runner.py b/security_pattern_miner/src/runner.py index 12f1853..34c31c4 100644 --- a/security_pattern_miner/src/runner.py +++ b/security_pattern_miner/src/runner.py @@ -16,46 +16,37 @@ (JAVA, MAVEN): JavaDependentMiner } -class Pipeline: +class SecurityPatternMiner: + """Handles mining of dependent repositories""" def __init__(self, args): - # if args.language.lower() != PYTHON or args.package_manager != PYPI: - # raise ValueError("Currently, only Python language with PyPI package manager is supported.") - self.args = args - if not args.construct_queries: - if args.max_pages: - LibrariesIOConfig.max_num_pages = args.max_pages - if args.per_page: - LibrariesIOConfig.max_per_page = args.per_page - if args.start_page: - LibrariesIOConfig.start_page = args.start_page - if args.start_index is not None: - GitCrawlerConfig.start_index = args.start_index - if args.end_index is not None: - GitCrawlerConfig.end_index = args.end_index - if args.root_data_dir: - LibrariesIOConfig.root_data_dir = args.root_data_dir - LibrariesIOConfig.dependent_repo_info_save_dir = os.path.join(args.root_data_dir, "dependent_repos_info") - GitCrawlerConfig.root_data_dir = args.root_data_dir - 
GitCrawlerConfig.cloned_repos_dir = os.path.join(args.root_data_dir, "cloned_repos") - QueriesLoaderConfig.root_data_dir = args.root_data_dir - QueriesLoaderConfig.repos_name_dir = os.path.join(args.root_data_dir, "dependent_repos_info") - QueriesLoaderConfig.output_queries_dir = os.path.join(args.root_data_dir, "output_queries") - - - self.dependent_miner = dependent_miners.get((args.language.lower(), args.package_manager), None)(LibrariesIOConfig) - self.repo_crawler = GitCrawler(GitCrawlerConfig) - - if args.construct_queries and args.pattern: - self.query_constructor = QueriesLoader( - language=args.language.lower(), - pattern=args.pattern, - config=QueriesLoaderConfig - ) + + # Configure directories and limits + if args.max_pages: + LibrariesIOConfig.max_num_pages = args.max_pages + if args.per_page: + LibrariesIOConfig.max_per_page = args.per_page + if args.start_page: + LibrariesIOConfig.start_page = args.start_page + if args.start_index is not None: + GitCrawlerConfig.start_index = args.start_index + if args.end_index is not None: + GitCrawlerConfig.end_index = args.end_index + if args.root_data_dir: + LibrariesIOConfig.root_data_dir = args.root_data_dir + LibrariesIOConfig.dependent_repo_info_save_dir = os.path.join(args.root_data_dir, "dependent_repos_info") + GitCrawlerConfig.root_data_dir = args.root_data_dir + GitCrawlerConfig.cloned_repos_dir = os.path.join(args.root_data_dir, "cloned_repos") + + self.dependent_miner = dependent_miners.get((args.language.lower(), args.package_manager), None)(LibrariesIOConfig) + if not self.dependent_miner: + raise ValueError(f"Unsupported language/package manager combination: {args.language}/{args.package_manager}") + + self.repo_crawler = GitCrawler(GitCrawlerConfig) def run(self, package_names: list[str]): if self.args.get_dependents: - # Step 0 Get each package's dependents and save to files + # Step 0: Get each package's dependents and save to files for pkg in package_names: 
self.dependent_miner.get_dependents(pkg) self.dependent_miner.clean_saved_dependents(pkg) @@ -88,70 +79,100 @@ def run(self, package_names: list[str]): logger.info("Completed crawling and cloning dependent repositories") +class SecurityPatternExtractor: + """Handles construction of security pattern queries""" + def __init__(self, args): + self.args = args + + # Configure directories + if args.root_data_dir: + QueriesLoaderConfig.root_data_dir = args.root_data_dir + QueriesLoaderConfig.repos_name_dir = os.path.join(args.root_data_dir, "dependent_repos_info") + QueriesLoaderConfig.output_queries_dir = os.path.join(args.root_data_dir, "output_queries") + + if not args.pattern: + raise ValueError("Pattern is required for query construction") + + self.query_constructor = QueriesLoader( + language=args.language.lower(), + pattern=args.pattern, + config=QueriesLoaderConfig + ) + def construct_queries(self): - if self.args.construct_queries: - self.query_constructor.load_from_pattern_metadata_file( - file_path=os.path.join('./context_retriever/queries_library', self.query_constructor.yaml_path_postfix) - ) - queries = self.query_constructor.load_queries() - print(queries) - output_file_path = os.path.join(QueriesLoaderConfig.output_queries_dir, f"{self.args.pattern}_{self.args.web_framework}_queries.jsonl") - self.query_constructor.save_queries_to_file(output_file_path) + self.query_constructor.load_from_pattern_metadata_file( + file_path=os.path.join('./context_retriever/queries_library', self.query_constructor.yaml_path_postfix) + ) + queries = self.query_constructor.load_queries() + print(queries) + + # Ensure output directory exists + os.makedirs(QueriesLoaderConfig.output_queries_dir, exist_ok=True) + + output_file_path = os.path.join(QueriesLoaderConfig.output_queries_dir, f"{self.args.pattern}_{self.args.web_framework}_queries.jsonl") + self.query_constructor.save_queries_to_file(output_file_path) + logger.info(f"Queries saved to {output_file_path}") -if __name__ == 
"__main__": - import argparse - parser = argparse.ArgumentParser(description="Run the dependent miner and repo crawler") + +def create_miner_parser(): + """Create argument parser for mining functionality""" + parser = argparse.ArgumentParser(description="Mine dependent repositories") - # # Libaries.io related arguments - # parser.add_argument("--get_dependents", action="store_true", help="Flag to get dependents for the specified package names (from scratch)") + # Libraries.io related arguments + parser.add_argument("--get_dependents", action="store_true", help="Flag to get dependents for the specified package names (from scratch)") parser.add_argument("--language", type=str, default=PYTHON, help="Programming language") - # parser.add_argument("--package_manager", type=str, default=PYPI, help="Package manager") - # parser.add_argument("--package_names", type=str, nargs='+', required=True, help="List of package names to find mutual dependents") - # parser.add_argument("--max_pages", type=int, default=2000, help="Maximum number of pages to fetch from Libraries.io") - # parser.add_argument("--per_page", type=int, default=100, help="Number of results per page from Libraries.io") - # parser.add_argument("--start_page", type=int, default=1, help="Starting page number for fetching dependents") + parser.add_argument("--package_manager", type=str, default=PYPI, help="Package manager") + parser.add_argument("--package_names", type=str, nargs='+', required=True, help="List of package names to find mutual dependents") + parser.add_argument("--max_pages", type=int, default=2000, help="Maximum number of pages to fetch from Libraries.io") + parser.add_argument("--per_page", type=int, default=100, help="Number of results per page from Libraries.io") + parser.add_argument("--start_page", type=int, default=1, help="Starting page number for fetching dependents") parser.add_argument("--root_data_dir", type=str, default="/data", help="Directory to save dependent repository info") + 
parser.add_argument("--clean_only", action="store_true", help="Flag to only clean previously saved dependent info files and exit") + + # Git crawler related arguments + parser.add_argument("--crawl_only", action="store_true", help="Flag to only crawl repositories from previously saved dependent info") + parser.add_argument("--start_index", type=int, default=0, help="Start index for crawling repositories") + parser.add_argument("--end_index", type=int, default=-1, help="End index for crawling repositories") - # parser.add_argument("--clean_only", action="store_true", help="Flag to only clean previously saved dependent info files and exit") - # # Git crawler related arguments - # parser.add_argument("--crawl_only", action="store_true", help="Flag to only crawl repositories from previously saved dependent info") - # parser.add_argument("--start_index", type=int, default=0, help="Start index for crawling repositories") - # parser.add_argument("--end_index", type=int, default=-1, help="End index for crawling repositories") + return parser + + +def create_extractor_parser(): + """Create argument parser for query extraction functionality""" + parser = argparse.ArgumentParser(description="Extract security pattern queries") parser.add_argument("--construct_queries", action="store_true", help="Flag to construct queries based on the specified pattern") - parser.add_argument("--pattern", type=str, default="", help="Security pattern name for query construction") + parser.add_argument("--pattern", type=str, required=True, help="Security pattern name for query construction") parser.add_argument("--web_framework", type=str, default="fastapi", help="Web framework name for query construction") - # parser.add_argument("--root_data_dir", type=str, default="/ - args = parser.parse_args() - - - - pipeline = Pipeline(args) - pipeline.construct_queries() - # # print(args.package_names) - # pipeline.run(args.package_names) - # python_dependent_miner.get_dependents("flask") - # 
python_dependent_miner.clean_saved_dependents("flask") + parser.add_argument("--language", type=str, default=PYTHON, help="Programming language") + parser.add_argument("--root_data_dir", type=str, default="/data", help="Directory to save query outputs") - # # Token-based authentication - # token_based_auth_repos = python_dependent_miner.find_mutual_dependents(["fastapi", "pyjwt"]) - # python_dependent_miner.save_mutual_dependents(["fastapi", "pyjwt"], token_based_auth_repos) - # # Password-based authentication - # password_based_auth_repos = python_dependent_miner.find_mutual_dependents(["fastapi", "passlib"]) - # python_dependent_miner.save_mutual_dependents(["fastapi", "passlib"], password_based_auth_repos) - - - # # Password-based and token-based authentication - # password_based_auth_repos = python_dependent_miner.find_mutual_dependents(["fastapi", "passlib", "pyjwt"]) - # saved_jsonl_path = python_dependent_miner.save_mutual_dependents(["fastapi", "passlib", "pyjwt"], password_based_auth_repos) - - # # Password-based and token-based authentication - # password_based_auth_repos = python_dependent_miner.find_mutual_dependents(["fastapi", "passlib", "pyjwt"]) - # saved_jsonl_path = python_dependent_miner.save_mutual_dependents(["fastapi", "passlib", "pyjwt"], password_based_auth_repos) + return parser - # from repo_crawler.base import GitCrawler - # git_crawler = GitCrawler() - # dependent_repos = git_crawler.load_dependedent_repos_info(saved_jsonl_path) - # git_crawler.crawl_from_dependent_repos_info(dependent_repos) - \ No newline at end of file +if __name__ == "__main__": + import argparse + import sys + + # Check if this is being run for mining or extracting + if "--get_dependents" in sys.argv or "--crawl_only" in sys.argv or "--clean_only" in sys.argv: + # Mining mode + parser = create_miner_parser() + args = parser.parse_args() + + miner = SecurityPatternMiner(args) + miner.run(args.package_names) + + elif "--construct_queries" in sys.argv: + # Extracting 
mode + parser = create_extractor_parser() + args = parser.parse_args() + + extractor = SecurityPatternExtractor(args) + extractor.construct_queries() + + else: + print("Error: Please specify either mining arguments (--get_dependents, --crawl_only, --clean_only) or extraction arguments (--construct_queries)") + print("For mining: use --get_dependents, --crawl_only, or --clean_only") + print("For extraction: use --construct_queries") + sys.exit(1) From 13158d0f0e35e2a69ca95b0b9efbd4c4a8fdbbfc Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Sun, 9 Nov 2025 01:20:33 +0000 Subject: [PATCH 16/26] tmp retriever --- security_pattern_miner/src/config/zoekt.py | 15 ++ .../src/context_retriever/zoekt_retriever.py | 130 ++++++++++++++++++ 2 files changed, 145 insertions(+) create mode 100644 security_pattern_miner/src/config/zoekt.py diff --git a/security_pattern_miner/src/config/zoekt.py b/security_pattern_miner/src/config/zoekt.py new file mode 100644 index 0000000..ae8cae6 --- /dev/null +++ b/security_pattern_miner/src/config/zoekt.py @@ -0,0 +1,15 @@ +import sys; sys.path.append("..") +from config.constants import NUM_CONTEXT_LINES +from enum import Enum +import os + +class ZoektConfig(): + """ + Configuration class for search settings. 
+ """ + num_context_lines: int = os.getenv('NUM_CONTEXT_LINES', NUM_CONTEXT_LINES) + max_results: int = os.getenv('MAX_RESULTS', 10) + max_retries: int = os.getenv('MAX_RETRIES', 3) + retry_delay: float = os.getenv('RETRY_DELAY', 0.2) + zoekt_url: str = os.getenv('ZOEKT_URL', 'http://localhost:6070/api/search') + max_candidates_used: int = os.getenv('MAX_CANDIDATES_USED', 10) diff --git a/security_pattern_miner/src/context_retriever/zoekt_retriever.py b/security_pattern_miner/src/context_retriever/zoekt_retriever.py index e69de29..0a8032f 100644 --- a/security_pattern_miner/src/context_retriever/zoekt_retriever.py +++ b/security_pattern_miner/src/context_retriever/zoekt_retriever.py @@ -0,0 +1,130 @@ +import sys; sys.path.append("..") +from config.zoekt import ZoektConfig +from context_retriever.queries_loader import Query +import requests +from urllib3.exceptions import NewConnectionError +from requests.exceptions import ConnectionError, Timeout , RequestException +import json +import time +from logging import getLogger + +logger = getLogger(__name__) + +class ZoektSearchRequester: + """ + A class to handle search requests to Zoekt. + """ + + def __init__(self, config: ZoektConfig): + self.config = config + self.num_successful_searches = 0 + self.num_failed_searches = 0 + + def zoekt_search_on_query_point( + self, + query_point: Query): + query = query_point.query + result = self.zoekt_search_request(query) + if result and "Result" in result and "Files" in result["Result"]: + files = result["Result"]["Files"] + if files: + logger.info(f"Found {len(files)} files for query: {query}") + self.num_successful_searches += 1 + return result + count += 1 + self.num_failed_searches += 1 + return {"Result": {"Files": [], "FileCount": 0}} + + def zoekt_search_request( + self, + query: str, + ) -> dict: + """ + Make a request to the zoekt search API with error handling and retry logic. 
+ + Args: + query: Search query string + num_context_lines: Number of context lines to include + max_results: Maximum number of results to return + max_retries: Maximum number of retry attempts + retry_delay: Delay between retries in seconds + + Returns: + Dict containing search results or empty result on failure + """ + if query is None or query.strip() == "": + print("Empty query provided. Returning empty result.") + return {"Result": {"Files": [], "FileCount": 0}} + + url = self.config.zoekt_url + payload = json.dumps({ + "Q": query, + "Opts": { + "NumContextLines": self.config.num_context_lines, + "MaxResults": self.config.max_results, + } + }) + headers = { + 'Content-Type': 'application/json' + } + + for attempt in range(self.config.max_retries + 1): + try: + response = requests.request("POST", url, headers=headers, data=payload, timeout=30) + + # Check if response is successful + if response.status_code == 200: + # print(response.json()) + return response.json() + else: + logger.error(f"HTTP {response.status_code} error: {response.text}") + if attempt < self.config.max_retries: + logger.info(f"Retrying in {self.config.retry_delay} seconds... (attempt {attempt + 1}/{self.config.max_retries})") + time.sleep(self.config.retry_delay) + continue + else: + logger.info("Max retries reached. Returning empty result.") + return {"Result": {"Files": [], "FileCount": 0}} + + except (ConnectionError, NewConnectionError) as e: + logger.error(f"Connection error on attempt {attempt + 1}: {e}") + if attempt < self.config.max_retries: + logger.info(f"Zoekt service might be down. 
Retrying in {self.config.retry_delay} seconds...") + time.sleep(self.config.retry_delay) + else: + logger.info("Failed to connect to Zoekt service after all retries.") + print("Please check if Zoekt is running on http://localhost:6070") + return {"Result": {"Files": [], "FileCount": 0}} + + except Timeout as e: + logger.error(f"Request timeout on attempt {attempt + 1}: {e}") + if attempt < self.config.max_retries: + logger.info(f"Retrying in {self.config.retry_delay} seconds...") + time.sleep(self.config.retry_delay) + else: + logger.info("Request timed out after all retries.") + return {"Result": {"Files": [], "FileCount": 0}} + + except RequestException as e: + logger.error(f"Request error on attempt {attempt + 1}: {e}") + if attempt < self.config.max_retries: + logger.info(f"Retrying in {self.config.retry_delay} seconds...") + time.sleep(self.config.retry_delay) + else: + logger.error("Request failed after all retries.") + return {"Result": {"Files": [], "FileCount": 0}} + + except json.JSONDecodeError as e: + logger.error(f"JSON decode error: {e}") + logger.error(f"Response content: {response.text if 'response' in locals() else 'No response'}") + return {"Result": {"Files": [], "FileCount": 0}} + + except Exception as e: + logger.error(f"Unexpected error: {e}") + return {"Result": {"Files": [], "FileCount": 0}} + + # This should never be reached, but just in case + return {"Result": {"Files": [], "FileCount": 0}} + + + \ No newline at end of file From e0eb0e1c06d012ec6db1401fe02a6630ed8718bb Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Sun, 9 Nov 2025 02:30:36 +0000 Subject: [PATCH 17/26] tmp retriever --- security_pattern_miner/src/config/zoekt.py | 4 ++ .../src/context_retriever/zoekt_retriever.py | 62 ++++++++++++++++--- 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/security_pattern_miner/src/config/zoekt.py b/security_pattern_miner/src/config/zoekt.py index ae8cae6..20cf6ad 100644 --- a/security_pattern_miner/src/config/zoekt.py +++ 
b/security_pattern_miner/src/config/zoekt.py @@ -13,3 +13,7 @@ class ZoektConfig(): retry_delay: float = os.getenv('RETRY_DELAY', 0.2) zoekt_url: str = os.getenv('ZOEKT_URL', 'http://localhost:6070/api/search') max_candidates_used: int = os.getenv('MAX_CANDIDATES_USED', 10) + top_k_files: int = os.getenv('TOP_K_FILES', 5) + get_whole_file: bool = os.getenv('GET_WHOLE_FILE', False) + root_data_dir: str = os.getenv('ROOT_DATA_DIR', '/data') + cloned_repos_dir: str = os.path.join(root_data_dir, 'cloned_repos') \ No newline at end of file diff --git a/security_pattern_miner/src/context_retriever/zoekt_retriever.py b/security_pattern_miner/src/context_retriever/zoekt_retriever.py index 0a8032f..81493b5 100644 --- a/security_pattern_miner/src/context_retriever/zoekt_retriever.py +++ b/security_pattern_miner/src/context_retriever/zoekt_retriever.py @@ -1,4 +1,9 @@ -import sys; sys.path.append("..") +import os +import sys + +sys.path.append("..") +from typing import List +from pydantic import BaseModel from config.zoekt import ZoektConfig from context_retriever.queries_loader import Query import requests @@ -7,9 +12,20 @@ import json import time from logging import getLogger - +from typing import List +from base64 import decodebytes logger = getLogger(__name__) +class Context(BaseModel): + filepath: str + start_line: int + end_line: int + snippet: str + +class SearchedResponse(Query): + success: bool + contexts: List[Context] = [] + class ZoektSearchRequester: """ A class to handle search requests to Zoekt. 
@@ -17,8 +33,6 @@ class ZoektSearchRequester: def __init__(self, config: ZoektConfig): self.config = config - self.num_successful_searches = 0 - self.num_failed_searches = 0 def zoekt_search_on_query_point( self, @@ -29,11 +43,10 @@ def zoekt_search_on_query_point( files = result["Result"]["Files"] if files: logger.info(f"Found {len(files)} files for query: {query}") - self.num_successful_searches += 1 - return result - count += 1 - self.num_failed_searches += 1 - return {"Result": {"Files": [], "FileCount": 0}} + return files + # count += 1 + # self.num_failed_searches += 1 + return [] def zoekt_search_request( self, @@ -126,5 +139,34 @@ def zoekt_search_request( # This should never be reached, but just in case return {"Result": {"Files": [], "FileCount": 0}} - + def handle_file_path( filepath: str) -> str: + project_metadata, navigation_path = filepath.split(":", 1) + project_metadata = project_metadata.replace("/", "_").replace("github.com_", "") + return os.path.join(project_metadata, navigation_path) + def post_process_search_results( + self, + files: list, + query_point: Query) -> SearchedResponse: + searched_response = SearchedResponse() + searched_response.query = query_point.query + searched_response.success = False + contexts = [] + for file in files: + if "LineMatches" in file: + line_matches = file["LineMatches"] + for line_match in line_matches: + context = Context() + context.filepath = file.get("FileName", "") + context.start_line = max(0, line_match['LineNumber'] - self.config.num_context_lines - 1) + context.end_line = line_match['LineNumber'] + self.config.num_context_lines + if self.config.get_whole_file: + with open(os.path.join(self.config.cloned_repos_dir, context.filepath), 'r') as f: + context.snippet = f.read() + else: + context.snippet = decodebytes(line_match['Context'].encode()).decode('utf-8', errors='ignore') + contexts.append(context) + if contexts: + searched_response.success = True + searched_response.contexts = contexts + return 
searched_response \ No newline at end of file From 64951ebd04e81593725caf3f0510a716d7b39d7d Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Sun, 9 Nov 2025 02:47:57 +0000 Subject: [PATCH 18/26] feat: enhance Zoekt integration with search and result saving functionality --- .../src/config/constants.py | 2 +- security_pattern_miner/src/config/zoekt.py | 17 ++-- .../src/context_retriever/zoekt_retriever.py | 92 ++++++++++++++++--- security_pattern_miner/src/runner.py | 56 ++++++++++- 4 files changed, 143 insertions(+), 24 deletions(-) diff --git a/security_pattern_miner/src/config/constants.py b/security_pattern_miner/src/config/constants.py index 1f172ef..ea59c62 100644 --- a/security_pattern_miner/src/config/constants.py +++ b/security_pattern_miner/src/config/constants.py @@ -15,4 +15,4 @@ OBSCURE_TOKEN_ACCESS_CONTROL = "obscure_token_access_control" SESSION_BASED_ACCESS_CONTROL = "session_based_access_control" - +NUM_CONTEXT_LINES = 20 diff --git a/security_pattern_miner/src/config/zoekt.py b/security_pattern_miner/src/config/zoekt.py index 20cf6ad..9273413 100644 --- a/security_pattern_miner/src/config/zoekt.py +++ b/security_pattern_miner/src/config/zoekt.py @@ -7,13 +7,14 @@ class ZoektConfig(): """ Configuration class for search settings. 
""" - num_context_lines: int = os.getenv('NUM_CONTEXT_LINES', NUM_CONTEXT_LINES) - max_results: int = os.getenv('MAX_RESULTS', 10) - max_retries: int = os.getenv('MAX_RETRIES', 3) - retry_delay: float = os.getenv('RETRY_DELAY', 0.2) + num_context_lines: int = int(os.getenv('NUM_CONTEXT_LINES', NUM_CONTEXT_LINES)) + max_results: int = int(os.getenv('MAX_RESULTS', 10)) + max_retries: int = int(os.getenv('MAX_RETRIES', 3)) + retry_delay: float = float(os.getenv('RETRY_DELAY', 0.2)) zoekt_url: str = os.getenv('ZOEKT_URL', 'http://localhost:6070/api/search') - max_candidates_used: int = os.getenv('MAX_CANDIDATES_USED', 10) - top_k_files: int = os.getenv('TOP_K_FILES', 5) - get_whole_file: bool = os.getenv('GET_WHOLE_FILE', False) + max_candidates_used: int = int(os.getenv('MAX_CANDIDATES_USED', 10)) + top_k_files: int = int(os.getenv('TOP_K_FILES', 5)) + get_whole_file: bool = os.getenv('GET_WHOLE_FILE', 'False').lower() == 'true' root_data_dir: str = os.getenv('ROOT_DATA_DIR', '/data') - cloned_repos_dir: str = os.path.join(root_data_dir, 'cloned_repos') \ No newline at end of file + cloned_repos_dir: str = os.path.join(root_data_dir, 'cloned_repos') + search_results_dir: str = os.path.join(root_data_dir, 'search_results') \ No newline at end of file diff --git a/security_pattern_miner/src/context_retriever/zoekt_retriever.py b/security_pattern_miner/src/context_retriever/zoekt_retriever.py index 81493b5..d106109 100644 --- a/security_pattern_miner/src/context_retriever/zoekt_retriever.py +++ b/security_pattern_miner/src/context_retriever/zoekt_retriever.py @@ -139,34 +139,102 @@ def zoekt_search_request( # This should never be reached, but just in case return {"Result": {"Files": [], "FileCount": 0}} - def handle_file_path( filepath: str) -> str: - project_metadata, navigation_path = filepath.split(":", 1) + @staticmethod + def handle_file_path(filepath: str) -> str: + project_metadata, navigation_path = filepath.split(":", 1) project_metadata = 
project_metadata.replace("/", "_").replace("github.com_", "") return os.path.join(project_metadata, navigation_path) + def post_process_search_results( self, files: list, query_point: Query) -> SearchedResponse: - searched_response = SearchedResponse() - searched_response.query = query_point.query - searched_response.success = False + searched_response = SearchedResponse( + repo=query_point.repo, + role=query_point.role, + query=query_point.query, + webframework=query_point.webframework, + pattern=query_point.pattern, + success=False, + contexts=[] + ) contexts = [] for file in files: if "LineMatches" in file: line_matches = file["LineMatches"] for line_match in line_matches: - context = Context() + print(line_match) + context = Context( + filepath="", + start_line=0, + end_line=0, + snippet="" + ) context.filepath = file.get("FileName", "") - context.start_line = max(0, line_match['LineNumber'] - self.config.num_context_lines - 1) - context.end_line = line_match['LineNumber'] + self.config.num_context_lines + context.start_line = line_match['LineStart'] + context.end_line = line_match['LineEnd'] if self.config.get_whole_file: - with open(os.path.join(self.config.cloned_repos_dir, context.filepath), 'r') as f: - context.snippet = f.read() + # try: + processed_filepath = self.handle_file_path(context.filepath) + full_path = os.path.join(self.config.cloned_repos_dir, processed_filepath) + with open(full_path, 'r', encoding='utf-8', errors='ignore') as f: + context.snippet = f.read() + # except Exception as e: + # logger.error(f"Error reading file {context.filepath}: {e}") + # context.snippet = decodebytes(line_match['Content'].encode()).decode('utf-8', errors='ignore') else: - context.snippet = decodebytes(line_match['Context'].encode()).decode('utf-8', errors='ignore') + before, current, after = line_match['Before'], line_match['Line'], line_match['After'] + context.snippet = decodebytes((before + current + after).encode()).decode('utf-8', errors='ignore') 
contexts.append(context) if contexts: searched_response.success = True searched_response.contexts = contexts return searched_response - \ No newline at end of file + + def save_search_results_to_file(self, search_results: List[SearchedResponse], output_file_path: str): + """ + Save processed search results to a JSONL file. + + Args: + search_results: List of SearchedResponse objects + output_file_path: Path to the output JSONL file + """ + import jsonlines + + # Ensure output directory exists + os.makedirs(os.path.dirname(output_file_path), exist_ok=True) + + with jsonlines.open(output_file_path, "w") as writer: + for result in search_results: + writer.write(result.dict()) + + logger.info(f"Saved {len(search_results)} search results to {output_file_path}") + + def search_queries_and_save(self, queries: List[Query], output_file_path: str): + """ + Search all queries using Zoekt and save results to file. + + Args: + queries: List of Query objects to search + output_file_path: Path to save the search results + """ + search_results = [] + + logger.info(f"Starting search for {len(queries)} queries") + + for i, query in enumerate(queries): + logger.info(f"Processing query {i+1}/{len(queries)}: {query.query[:100]}...") + + files = self.zoekt_search_on_query_point(query) + searched_response = self.post_process_search_results(files, query) + search_results.append(searched_response) + + # Log progress + if searched_response.success: + logger.info(f"Query {i+1} successful: found {len(searched_response.contexts)} contexts") + else: + logger.info(f"Query {i+1} returned no results") + + # Save all results + self.save_search_results_to_file(search_results, output_file_path) + return search_results diff --git a/security_pattern_miner/src/runner.py b/security_pattern_miner/src/runner.py index 34c31c4..64060a7 100644 --- a/security_pattern_miner/src/runner.py +++ b/security_pattern_miner/src/runner.py @@ -9,7 +9,9 @@ from config.crawler import GitCrawlerConfig from 
config.libraries_io import LibrariesIOConfig from config.queries_loader import QueriesLoaderConfig +from config.zoekt import ZoektConfig from context_retriever.queries_loader import QueriesLoader +from context_retriever.zoekt_retriever import ZoektSearchRequester dependent_miners = { (PYTHON, PYPI): PythonDependentMiner, @@ -80,7 +82,7 @@ def run(self, package_names: list[str]): class SecurityPatternExtractor: - """Handles construction of security pattern queries""" + """Handles construction of security pattern queries and retrieval of contexts using Zoekt""" def __init__(self, args): self.args = args @@ -89,6 +91,12 @@ def __init__(self, args): QueriesLoaderConfig.root_data_dir = args.root_data_dir QueriesLoaderConfig.repos_name_dir = os.path.join(args.root_data_dir, "dependent_repos_info") QueriesLoaderConfig.output_queries_dir = os.path.join(args.root_data_dir, "output_queries") + ZoektConfig.root_data_dir = args.root_data_dir + ZoektConfig.cloned_repos_dir = os.path.join(args.root_data_dir, "cloned_repos") + ZoektConfig.search_results_dir = os.path.join(args.root_data_dir, "search_results") + + if args.zoekt_url: + ZoektConfig.zoekt_url = args.zoekt_url if not args.pattern: raise ValueError("Pattern is required for query construction") @@ -96,15 +104,20 @@ def __init__(self, args): self.query_constructor = QueriesLoader( language=args.language.lower(), pattern=args.pattern, + web_framework=args.web_framework, config=QueriesLoaderConfig ) + + # Initialize Zoekt searcher if search is enabled + if args.search_queries: + self.zoekt_searcher = ZoektSearchRequester(ZoektConfig) def construct_queries(self): self.query_constructor.load_from_pattern_metadata_file( file_path=os.path.join('./context_retriever/queries_library', self.query_constructor.yaml_path_postfix) ) queries = self.query_constructor.load_queries() - print(queries) + logger.info(f"Loaded {len(queries)} queries for pattern {self.args.pattern}") # Ensure output directory exists 
os.makedirs(QueriesLoaderConfig.output_queries_dir, exist_ok=True) @@ -112,6 +125,41 @@ def construct_queries(self): output_file_path = os.path.join(QueriesLoaderConfig.output_queries_dir, f"{self.args.pattern}_{self.args.web_framework}_queries.jsonl") self.query_constructor.save_queries_to_file(output_file_path) logger.info(f"Queries saved to {output_file_path}") + + return queries + + def search_and_save_results(self, queries): + """Search queries using Zoekt and save results""" + if not hasattr(self, 'zoekt_searcher'): + logger.error("Zoekt searcher not initialized. Use --search_queries flag.") + return + + # Ensure search results directory exists + os.makedirs(ZoektConfig.search_results_dir, exist_ok=True) + + search_results_file = os.path.join( + ZoektConfig.search_results_dir, + f"{self.args.pattern}_{self.args.web_framework}_search_results.jsonl" + ) + + logger.info(f"Starting search for {len(queries)} queries using Zoekt at {ZoektConfig.zoekt_url}") + search_results = self.zoekt_searcher.search_queries_and_save(queries, search_results_file) + + # Log summary statistics + successful_searches = sum(1 for result in search_results if result.success) + total_contexts = sum(len(result.contexts) for result in search_results) + + logger.info(f"Search completed: {successful_searches}/{len(queries)} queries successful, {total_contexts} total contexts found") + logger.info(f"Search results saved to {search_results_file}") + + def run(self): + """Main execution method for the extractor""" + # Step 1: Construct queries + queries = self.construct_queries() + + # Step 2: Search queries if enabled + if self.args.search_queries: + self.search_and_save_results(queries) def create_miner_parser(): @@ -142,10 +190,12 @@ def create_extractor_parser(): parser = argparse.ArgumentParser(description="Extract security pattern queries") parser.add_argument("--construct_queries", action="store_true", help="Flag to construct queries based on the specified pattern") + 
parser.add_argument("--search_queries", action="store_true", help="Flag to search constructed queries using Zoekt") parser.add_argument("--pattern", type=str, required=True, help="Security pattern name for query construction") parser.add_argument("--web_framework", type=str, default="fastapi", help="Web framework name for query construction") parser.add_argument("--language", type=str, default=PYTHON, help="Programming language") parser.add_argument("--root_data_dir", type=str, default="/data", help="Directory to save query outputs") + parser.add_argument("--zoekt_url", type=str, help="Zoekt search API URL (overrides environment variable)") return parser @@ -169,7 +219,7 @@ def create_extractor_parser(): args = parser.parse_args() extractor = SecurityPatternExtractor(args) - extractor.construct_queries() + extractor.run() else: print("Error: Please specify either mining arguments (--get_dependents, --crawl_only, --clean_only) or extraction arguments (--construct_queries)") From 4601b5b1616ff69dff8d162b0321c0d2db607e87 Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Sun, 9 Nov 2025 02:58:02 +0000 Subject: [PATCH 19/26] pwd pattern --- docs/zoekt_queries_security_patterns.md | 325 ++++++++++++++++++++++++ 1 file changed, 325 insertions(+) create mode 100644 docs/zoekt_queries_security_patterns.md diff --git a/docs/zoekt_queries_security_patterns.md b/docs/zoekt_queries_security_patterns.md new file mode 100644 index 0000000..94a7ecc --- /dev/null +++ b/docs/zoekt_queries_security_patterns.md @@ -0,0 +1,325 @@ +# Zoekt Queries for Security Pattern Role Detection + +This table provides Zoekt search queries to locate files containing implementations of specific security pattern roles in Python FastAPI projects. + +--- + +## 1. 
Password-Based Authentication + +| Role | Zoekt Query | Description | +|------|-------------|-------------| +| **Enforcer** | `OAuth2PasswordBearer OAuth2PasswordRequestForm` | Files containing both OAuth2 password authentication classes | +| **Enforcer** | `class OAuth2PasswordBearer case:yes` | Files with OAuth2PasswordBearer (case-sensitive) | +| **Enforcer** | `HTTPBasic HTTPBasicCredentials` | Files implementing HTTP Basic authentication | +| **Verification Manager** | `CryptContext pwd_context.verify` | Files with password verification context | +| **Verification Manager** | `def verify_password lang:python` | Python functions that verify passwords | +| **Comparator** | `pwd_context.verify bcrypt.checkpw` | Files containing password comparison functions | +| **Hasher** | `pwd_context.hash bcrypt.hashpw` | Files with password hashing implementations | +| **Hasher** | `CryptContext schemes bcrypt` | Files configuring bcrypt hashing | +| **Password Store** | `hashed_password User.query` | Files with user password storage queries | +| **Password Store** | `class User password sqlalchemy` | SQLAlchemy User models with password fields | +| **Pepper Store** | `Fernet pepper_key encrypt` | Files implementing pepper encryption | +| **Encrypter** | `from cryptography.fernet import Fernet` | Files importing Fernet encryption | +| **System** | `@app.post /login async def` | Login endpoint implementations | +| **Registrar** | `@app.post /register pwd_context.hash` | Registration endpoints with password hashing | +| **Registrar** | `def register_user db.add` | User registration functions | +| **Password Policy** | `@validator password len` | Pydantic password validators | +| **Password Policy** | `class PasswordPolicy validate` | Password policy validation classes | +| **SRNG** | `secrets.token_urlsafe secrets.token_bytes` | Secure random number generation | + +--- + +## 2. 
Verifiable Token-Based Authentication (JWT) + +| Role | Zoekt Query | Description | +|------|-------------|-------------| +| **Enforcer** | `HTTPBearer OAuth2PasswordBearer` | Bearer token enforcement | +| **Enforcer** | `HTTPAuthorizationCredentials Depends` | HTTP authorization credentials dependency | +| **Verifier** | `jwt.decode SECRET_KEY algorithms` | JWT token verification | +| **Verifier** | `def verify_token jwt.decode` | Token verification functions | +| **Cryptographer (MAC)** | `jwt.encode HS256 SECRET_KEY` | JWT encoding with HMAC | +| **Cryptographer (MAC)** | `from jose import jwt` | Python-jose JWT import | +| **Cryptographer (Sig)** | `jwt.encode RS256 private_key` | JWT encoding with RSA signature | +| **Cryptographer (Sig)** | `jwt.decode public_key RS256` | JWT verification with public key | +| **Key Manager (RSA)** | `rsa.generate_private_key public_exponent` | RSA key generation | +| **Key Manager (RSA)** | `from cryptography.hazmat.primitives.asymmetric import rsa` | RSA key management imports | +| **Token Generator** | `def create_access_token jwt.encode` | Access token creation functions | +| **Token Generator** | `jwt.encode exp sub` | JWT encoding with expiration and subject | +| **Registrar** | `@app.post /token create_access_token` | Token issuance endpoints | +| **Registrar** | `return access_token token_type bearer` | Token response formatting | +| **Token Blacklist** | `redis.sadd revoked_tokens jwt` | Token revocation with Redis | +| **Token Blacklist** | `blacklist.revoke token` | Token blacklist management | + +--- + +## 3. 
Opaque Token-Based Authentication (Session) + +| Role | Zoekt Query | Description | +|------|-------------|-------------| +| **Enforcer** | `APIKeyCookie name session_id` | Cookie-based session enforcement | +| **Enforcer** | `APIKeyHeader X-Session-Token` | Header-based session enforcement | +| **Verifier** | `async def verify_session session_manager` | Session verification functions | +| **Verifier** | `session_id HTTPException 401` | Session validation with error handling | +| **Principal Provider (Redis)** | `redis.setex session: principal` | Redis session storage | +| **Principal Provider (Redis)** | `await redis.get session:` | Async Redis session retrieval | +| **Principal Provider (Memory)** | `TTLCache maxsize ttl` | In-memory session cache | +| **Token Generator** | `secrets.token_urlsafe secrets.token_hex` | Secure session token generation | +| **Token Generator** | `session_id = secrets.token` | Session ID generation | +| **Registrar** | `@app.post /login response.set_cookie session_id` | Login with session cookie creation | +| **Registrar** | `redis.setex httponly secure` | Session registration with secure cookies | +| **Session Manager** | `class SessionManager create_session` | Session management class | +| **Session Manager** | `async def get_principal async def invalidate_session` | Session lifecycle methods | +| **Session Manager** | `session_timeout absolute_timeout` | Session timeout configuration | + +--- + +## 4. 
Session-Based Access Control + +| Role | Zoekt Query | Description | +|------|-------------|-------------| +| **Authentication Enforcer** | `APIKeyCookie session_id Depends` | Session-based authentication enforcement | +| **Verifier** | `session_manager.get_session HTTPException` | Session verification with error handling | +| **Session Manager** | `class SessionManager permissions role` | Session manager with authorization data | +| **Session Manager** | `redis.setex session: SessionData.json` | Session storage with permissions | +| **Session ID Generator** | `secrets.token_urlsafe 32` | Session ID generation (32 bytes) | +| **Authorization Enforcer** | `class AuthorizationChecker required_permissions` | Authorization enforcement class | +| **Authorization Enforcer** | `def __call__ session_id required_permissions` | Authorization checker callable | +| **Decider** | `required_permissions.issubset session.permissions` | Permission checking logic | +| **Decider** | `HTTPException 403 Missing permissions` | Authorization denial | +| **Policy Provider** | `class PolicyProvider get_role_permissions` | Policy management class | +| **Policy Provider** | `role_permissions user_roles` | Role-permission mapping | +| **Policy Provider** | `db.query Role permissions` | Database role queries | +| **Registrar** | `create_session policy_provider.get_user_permissions` | Session creation with permissions | +| **Combined Enforcer** | `async def verify_credentials session.permissions` | Combined auth+authz verification | + +--- + +## 5. 
Obscure Token-Based Access Control (API Keys) + +| Role | Zoekt Query | Description | +|------|-------------|-------------| +| **Enforcer** | `APIKeyHeader X-API-Key auto_error` | API key header enforcement | +| **Enforcer** | `APIKeyQuery api_key Security` | API key query parameter enforcement | +| **Validator (Combined)** | `class APIKeyValidator required_permissions` | Combined auth+authz validator | +| **Validator** | `validate_token HTTPException 401 403` | API key validation with auth/authz errors | +| **Hasher** | `hashlib.sha256 token.encode hexdigest` | Token hashing with SHA-256 | +| **Hasher** | `hashlib.blake2b token_hash` | Token hashing with BLAKE2b | +| **Token Manager** | `class APIKey token_hash principal permissions` | API key database model | +| **Token Manager** | `db.query APIKey token_hash` | API key database queries | +| **Token Generator** | `secrets.token_urlsafe 32` | Secure API key generation (32+ bytes) | +| **Registrar** | `@app.post /api-keys secrets.token` | API key creation endpoint | +| **Registrar** | `token_hash = hashlib.sha256 db.add APIKey` | API key registration with hashing | +| **Registrar** | `return api_key Save this key` | API key response (shown once) | +| **Permission Checker** | `token_info.permissions HTTPException 403` | Permission verification | +| **Revocation** | `@app.delete /api-keys db.delete` | API key revocation endpoint | + +--- + +## Cross-Pattern Queries + +### Database Integration (Password Store, Token Manager, Policy Provider) + +| Component | Zoekt Query | Description | +|-----------|-------------|-------------| +| **SQLAlchemy Models** | `class User Base __tablename__` | SQLAlchemy user model definitions | +| **SQLAlchemy Queries** | `db.query User filter username` | User database queries | +| **Tortoise ORM** | `from tortoise import fields models` | Tortoise ORM imports | +| **Tortoise Queries** | `await User.get username` | Async user queries | +| **Database Session** | `SessionLocal sessionmaker 
create_engine` | Database session management | + +### Redis Integration (Session Storage, Token Blacklist) + +| Component | Zoekt Query | Description | +|-----------|-------------|-------------| +| **Redis Connection** | `redis.from_url redis://` | Redis connection setup | +| **Redis Session** | `redis.setex session: ttl` | Session storage with expiration | +| **Redis Async** | `await redis.get await redis.setex` | Async Redis operations | +| **Redis Set Operations** | `redis.sadd redis.sismember` | Set operations for blacklists | + +### Cryptography Operations (Hasher, Encrypter, Key Manager) + +| Component | Zoekt Query | Description | +|-----------|-------------|-------------| +| **Passlib** | `from passlib.context import CryptContext` | Passlib password hashing | +| **Bcrypt** | `import bcrypt hashpw gensalt` | Bcrypt password hashing | +| **Fernet Encryption** | `from cryptography.fernet import Fernet` | Symmetric encryption | +| **RSA Keys** | `from cryptography.hazmat.primitives.asymmetric` | Asymmetric cryptography | +| **JWT Operations** | `from jose import jwt encode decode` | JWT token operations | +| **Secure Random** | `import secrets token_urlsafe` | Cryptographically secure random | + +--- + +## Advanced Zoekt Queries + +### Finding Complete Pattern Implementations + +| Pattern | Zoekt Query | Description | +|---------|-------------|-------------| +| **Complete Password Auth** | `OAuth2PasswordBearer CryptContext pwd_context.hash pwd_context.verify lang:python` | Files with complete password authentication | +| **Complete JWT Auth** | `HTTPBearer jwt.encode jwt.decode SECRET_KEY lang:python` | Files with complete JWT authentication | +| **Complete Session Auth** | `APIKeyCookie secrets.token redis.setex response.set_cookie lang:python` | Files with complete session authentication | +| **Complete API Key** | `APIKeyHeader hashlib.sha256 secrets.token db.query APIKey lang:python` | Files with complete API key authentication | + +### Finding Security 
Anti-Patterns + +| Anti-Pattern | Zoekt Query | Description | +|--------------|-------------|-------------| +| **Weak Password Storage** | `password = User.password -hash -bcrypt -crypt` | Passwords stored without hashing | +| **Hardcoded Secrets** | `SECRET_KEY = "` | Hardcoded secret keys | +| **Insecure Cookies** | `set_cookie -secure -httponly` | Cookies without security flags | +| **Weak Random** | `random.randint -secrets -uuid4` | Weak random number generation | +| **SQL Injection Risk** | `f"SELECT * FROM users WHERE username = '{username}'"` | String interpolation in SQL | + +### Finding Authentication Endpoints + +| Endpoint Type | Zoekt Query | Description | +|---------------|-------------|-------------| +| **Login** | `@app.post /login /signin /auth` | Login endpoint definitions | +| **Registration** | `@app.post /register /signup` | Registration endpoints | +| **Logout** | `@app.post /logout invalidate delete_cookie` | Logout endpoints | +| **Token Refresh** | `@app.post /refresh /token/refresh` | Token refresh endpoints | +| **Password Reset** | `@app.post /reset-password /forgot-password` | Password reset endpoints | + +### Finding Authorization Logic + +| Authorization Type | Zoekt Query | Description | +|-------------------|-------------|-------------| +| **Permission Checks** | `required_permissions HTTPException 403` | Permission verification code | +| **Role Checks** | `role == Role.ADMIN session.role` | Role-based access control | +| **Scope Checks** | `SecurityScopes scopes` | OAuth2 scope verification | +| **Resource Ownership** | `if user.id == resource.owner_id` | Resource ownership checks | + +--- + +## File Type Filters + +Add these to any query to filter by file type: + +| Filter | Zoekt Syntax | Description | +|--------|-------------|-------------| +| **Python Files** | `lang:python` | Only Python files | +| **Python Files** | `f:\.py$` | Files ending in .py | +| **Config Files** | `file:config file:settings` | Configuration files | +| 
**Main/Init Files** | `file:__init__ file:main` | Main entry points | +| **Test Files** | `file:test_ file:_test` | Test files | +| **Exclude Tests** | `-file:test -file:_test` | Exclude test files | + +--- + +## Repository Filters + +Filter searches by repository characteristics: + +| Filter | Zoekt Syntax | Description | +|--------|-------------|-------------| +| **FastAPI Repos** | `r:fastapi` | Repositories with "fastapi" in name | +| **Python Repos** | `lang:python r:api r:auth` | Python repos with api/auth in name | +| **Exclude Forks** | `fork:no` | Exclude forked repositories | +| **Archived** | `archived:no` | Exclude archived repositories | +| **Public Only** | `public:yes` | Only public repositories | + +--- + +## Combined Query Examples + +### Example 1: Find Password Authentication in FastAPI Projects +``` +OAuth2PasswordBearer CryptContext lang:python f:\.py$ fork:no archived:no +``` + +### Example 2: Find JWT Token Generation with RS256 +``` +jwt.encode RS256 private_key create_access_token lang:python -file:test +``` + +### Example 3: Find Session Management with Redis +``` +APIKeyCookie redis.setex session: response.set_cookie httponly lang:python +``` + +### Example 4: Find API Key Authentication +``` +APIKeyHeader hashlib.sha256 token_hash permissions lang:python -file:test +``` + +### Example 5: Find Complete Session-Based Access Control +``` +APIKeyCookie required_permissions session.permissions HTTPException 403 lang:python +``` + +### Example 6: Find Secure Cookie Implementation +``` +set_cookie httponly secure samesite session_id lang:python +``` + +### Example 7: Find Password Reset Implementation +``` +@app.post reset-password secrets.token send_email lang:python +``` + +### Example 8: Find Multi-Factor Authentication +``` +OAuth2PasswordBearer pyotp totp verify lang:python +``` + +--- + +## Pattern Detection Strategy + +For each security pattern, use this search strategy: + +### 1. 
**Identify Enforcer** (Entry Point) +Search for FastAPI security dependencies to find files that enforce authentication. + +### 2. **Identify Verifier** (Core Logic) +Search for verification functions that validate credentials/tokens. + +### 3. **Identify Storage** (Persistence) +Search for database models and Redis operations for credential/session storage. + +### 4. **Identify Generators** (Token/Password Creation) +Search for functions that create new credentials or tokens. + +### 5. **Verify Complete Implementation** +Combine all role queries to find files with complete pattern implementations. + +--- + +## Query Optimization Tips + +1. **Start Broad**: Begin with key terms like `OAuth2PasswordBearer` or `jwt.encode` +2. **Add Context**: Add co-occurring terms like `CryptContext lang:python` +3. **Filter Noise**: Use `-file:test` to exclude test files +4. **Use Case Sensitivity**: Use `case:yes` for exact class/function names +5. **Combine Related Terms**: Use space for AND, `or` (lowercase) for alternatives +6. **Negate Terms**: Use `-term` to exclude unwanted matches +7. **File Extensions**: Use `f:\.py$` to ensure Python files only +8. 
**Repository Quality**: Add `fork:no archived:no` for active projects + +--- + +## Usage in SecPat Tool + +To use these queries in your SecPat pipeline: + +```python +# Example: Search for Password-Based Authentication Enforcer +query = "OAuth2PasswordBearer OAuth2PasswordRequestForm lang:python" +results = zoekt_search(query, repository_path) + +# Example: Find complete JWT implementation +query = "HTTPBearer jwt.encode jwt.decode SECRET_KEY lang:python -file:test" +results = zoekt_search(query, repository_path) + +# Example: Multi-role search for Session-Based Access Control +queries = { + "enforcer": "APIKeyCookie session_id Depends", + "verifier": "session_manager.get_session HTTPException", + "policy_provider": "get_role_permissions user_roles", + "decider": "required_permissions.issubset HTTPException 403" +} + +for role, query in queries.items(): + results[role] = zoekt_search(f"{query} lang:python", repo_path) +``` + From 2271809b141f7ae3ef2b54ce932b78d83723fa2c Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Sun, 9 Nov 2025 10:07:13 +0000 Subject: [PATCH 20/26] docker compose run --- docker-compose.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index ede8b09..bd12038 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -27,7 +27,11 @@ services: - ./build/volumes/data:/data env_file: - ./.env - command: ["python", "./runner.py", "--construct_queries", "--pattern", "password_based_authentication", "--web_framework", "fastapi", "--language", "python", "--root_data_dir=/data"] + environment: + - ZOEKT_URL=http://zoekt-webserver:6070/api/search + command: ["python", "./runner.py", "--construct_queries", "--search_queries", "--pattern", "password_based_authentication", "--web_framework", "fastapi", "--language", "python", "--root_data_dir=/data"] + depends_on: + - zoekt-webserver deploy: resources: limits: From 1befd68910feecf4dbc0eae3a8708310d53e4987 Mon Sep 17 00:00:00 2001 From: 
minhna1112 Date: Sun, 9 Nov 2025 11:07:20 +0000 Subject: [PATCH 21/26] update artifact docs --- docs/README_YAML_QUERIES.md | 366 ++++ docs/artifact_doc.md | 385 ++++ docs/fastapi_security_pattern_mapping.md | 1559 +++++++++++++++++ ...tern_to_dependencies_signatures_mapping.md | 305 ++++ 4 files changed, 2615 insertions(+) create mode 100644 docs/README_YAML_QUERIES.md create mode 100644 docs/artifact_doc.md create mode 100644 docs/fastapi_security_pattern_mapping.md create mode 100644 docs/pattern_to_dependencies_signatures_mapping.md diff --git a/docs/README_YAML_QUERIES.md b/docs/README_YAML_QUERIES.md new file mode 100644 index 0000000..cc3f809 --- /dev/null +++ b/docs/README_YAML_QUERIES.md @@ -0,0 +1,366 @@ +# Security Pattern Zoekt Query Definitions + +This directory contains YAML files defining Zoekt search queries for detecting security pattern implementations in Python FastAPI codebases. These files are used by the `SecurityPatternExtractor` to construct role-specific queries for finding pattern implementations in indexed repositories. + +## Purpose + +The YAML files serve as metadata for each security pattern, mapping pattern roles (from Van den Berghe's security patterns) to concrete implementation signatures found in security libraries. The `QueriesLoader` reads these files and generates Zoekt queries that search for specific API usage patterns in repositories. + +## Files Structure + +``` +context_retriever/queries_library/ +└── python/ + └── fastapi/ + └── patterns/ + ├── password_based_authentication.yaml + ├── verifiable_token_authentication.yaml (planned) + ├── opaque_token_authentication.yaml (planned) + ├── session_based_access_control.yaml (planned) + └── obscure_token_access_control.yaml (planned) +``` + +### Currently Implemented + +1. **password_based_authentication.yaml** - Password-Based Authentication pattern + +### Planned + +2. **verifiable_token_authentication.yaml** - Verifiable Token-Based Authentication (JWT) pattern +3. 
**opaque_token_authentication.yaml** - Opaque Token-Based Authentication (Session) pattern +4. **session_based_access_control.yaml** - Session-Based Access Control pattern +5. **obscure_token_access_control.yaml** - Obscure Token-Based Access Control (API Keys) pattern + +## YAML Structure + +Each YAML file follows this structure: + +```yaml +pattern: + name: "Pattern Name" + id: "01_01_XXX" # Van den Berghe catalogue ID + description: "Brief description of the pattern" + language: python + web_framework: fastapi + +# Repositories metadata file (from Phase 1: Mining) +repo_metadata_file: + - "fastapi_passlib_mutual_dependents.jsonl" + +# Dependencies required for this pattern +dependencies: + - fastapi + - passlib + +# Pattern roles and their search queries +roles: + role_name: + description: "What this role does in the pattern" + queries: + - query: "specific_api_call(" + description: "What this query finds" + priority: high # high|medium|low + - query: "another_signature" + description: "Alternative implementation" + priority: medium + + another_role: + description: "Another role description" + queries: + - query: "role_specific_api" + description: "Query description" + priority: high + +# Optional: queries for complete implementations +complete_implementation: + description: "Queries for finding complete pattern implementations" + queries: + - query: "term1 AND term2 AND term3" + description: "What complete implementation looks like" + priority: critical + min_matches: 3 # minimum terms that should match + +# Optional: endpoint patterns to search for +endpoints: + endpoint_type: + - "/login" + - "/register" + - "/auth/token" + +# Optional: anti-patterns to detect +anti_patterns: + description: "Security anti-patterns to detect" + pattern_name: + query: "dangerous_pattern" + severity: critical # critical|high|medium|low + description: "Why this is dangerous" + +# Optional: search filters +filters: + language: python + file_extension: "\\.py$" + exclude_tests: 
true + exclude_patterns: + - "test_" + - "tests/" +``` + +## How It Works + +### 1. Loading Pattern Metadata + +```python +# In SecurityPatternExtractor +self.query_constructor = QueriesLoader( + language="python", + pattern="password_based_authentication", + web_framework="fastapi", + config=QueriesLoaderConfig +) + +# Load YAML file +self.query_constructor.load_from_pattern_metadata_file( + file_path="./context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml" +) +``` + +### 2. Generating Queries + +For each role in the YAML file, the `QueriesLoader`: +1. Reads the `repo_metadata_file` to get list of repositories from Phase 1 +2. For each repository and each role: + - Takes the query string from YAML + - Adds language filter: `lang:python` + - Adds repository constraint: `r:github.com/owner/repo` + - Creates a `Query` object + +Example transformation: +```yaml +# YAML +roles: + hasher: + queries: + - query: "pwd_context.hash(" +``` + +Becomes: +```python +Query( + repo="github.com/owner/repo", + role="hasher", + query="pwd_context.hash( lang:python r:github.com/owner/repo", + webframework="fastapi", + pattern="password_based_authentication" +) +``` + +### 3. Searching with Zoekt + +The generated queries are sent to Zoekt, which: +1. Searches the indexed codebases +2. Returns file matches with line numbers +3. Extracts code snippets showing the implementation + +### 4. 
Output Format + +Queries are saved to: `build/volumes/data/output_queries/{pattern}_{web_framework}_queries.jsonl` + +Search results are saved to: `build/volumes/data/search_results/{pattern}_{web_framework}_search_results.jsonl` + +## Example: Password-Based Authentication + +### YAML Definition (simplified) + +```yaml +pattern: + name: "Password-Based Authentication" + id: "01_01_password_based_authentication" + +repo_metadata_file: + - "fastapi_passlib_mutual_dependents.jsonl" + +dependencies: + - fastapi + - passlib + +roles: + enforcer: + description: "Enforces authentication requirement" + queries: + - query: "OAuth2PasswordBearer(" + description: "FastAPI OAuth2 password flow" + priority: high + - query: "OAuth2PasswordRequestForm" + description: "Form for password authentication" + priority: high + + hasher: + description: "Hashes passwords for storage" + queries: + - query: "pwd_context.hash(" + description: "Passlib password hashing" + priority: high + - query: "CryptContext(schemes=" + description: "Passlib context initialization" + priority: high + + comparator: + description: "Verifies password against hash" + queries: + - query: "pwd_context.verify(" + description: "Passlib password verification" + priority: high +``` + +### Generated Queries + +If repository `github.com/user/fastapi-auth-app` is in the metadata file: + +```json +[ + { + "repo": "github.com/user/fastapi-auth-app", + "role": "enforcer", + "query": "OAuth2PasswordBearer( lang:python r:github.com/user/fastapi-auth-app", + "webframework": "fastapi", + "pattern": "password_based_authentication" + }, + { + "repo": "github.com/user/fastapi-auth-app", + "role": "hasher", + "query": "pwd_context.hash( lang:python r:github.com/user/fastapi-auth-app", + "webframework": "fastapi", + "pattern": "password_based_authentication" + }, + { + "repo": "github.com/user/fastapi-auth-app", + "role": "comparator", + "query": "pwd_context.verify( lang:python r:github.com/user/fastapi-auth-app", + "webframework": "fastapi", + "pattern": 
"password_based_authentication" + } +] +``` + +### Search Results + +```json +{ + "repo": "github.com/user/fastapi-auth-app", + "role": "hasher", + "query": "pwd_context.hash( lang:python r:github.com/user/fastapi-auth-app", + "webframework": "fastapi", + "pattern": "password_based_authentication", + "success": true, + "contexts": [ + { + "filepath": "user_fastapi-auth-app/app/auth.py", + "start_line": 15, + "end_line": 20, + "snippet": "def hash_password(password: str) -> str:\n \"\"\"Hash a password using bcrypt.\"\"\"\n return pwd_context.hash(password)\n" + } + ] +} +``` + +## Query Syntax + +Queries use Zoekt's search syntax, which supports: + +- **Literal strings**: `OAuth2PasswordBearer(` +- **Regular expressions**: `pwd_context\.(hash|verify)` +- **Language filters**: `lang:python` +- **Repository filters**: `r:github.com/owner/repo` +- **File filters**: `file:\.py$` +- **Boolean operators**: `term1 term2` (space-separated terms are ANDed), `term1 or term2` (lowercase `or` for alternatives) +- **Case sensitivity**: Default is `case:auto` (case-sensitive only when the query contains uppercase characters); force with `case:yes` or `case:no` + +For more details, see [Zoekt documentation](https://github.com/sourcegraph/zoekt). + +## Adding New Patterns + +To add a new security pattern: + +1. **Create YAML file**: `context_retriever/queries_library/python/fastapi/patterns/{pattern_name}.yaml` + +2. **Define pattern metadata**: + ```yaml + pattern: + name: "Your Pattern Name" + id: "XX_XX_pattern_id" + description: "Pattern description" + ``` + +3. **Specify repository metadata file** (from Phase 1): + ```yaml + repo_metadata_file: + - "package1_package2_mutual_dependents.jsonl" + ``` + +4. **Define roles and queries**: + ```yaml + roles: + role_name: + description: "Role description" + queries: + - query: "api_signature(" + description: "What this finds" + priority: high + ``` + +5. **Run extraction**: + ```bash + python src/runner.py \ + --construct_queries \ + --search_queries \ + --pattern {pattern_name} \ + --web_framework fastapi \ + --language python + ``` + +## Best Practices + +1. 
**Query Specificity**: Make queries specific enough to avoid false positives + - Good: `pwd_context.hash(` (targets specific API) + - Bad: `hash(` (too generic) + +2. **Priority Levels**: + - `high`: Core pattern implementation signatures + - `medium`: Alternative implementations or helper functions + - `low`: Related but not essential patterns + +3. **Multiple Queries per Role**: Provide alternatives for different implementations + ```yaml + hasher: + queries: + - query: "pwd_context.hash(" # Passlib + - query: "bcrypt.hashpw(" # Direct bcrypt + ``` + +4. **Repository Metadata**: Ensure the `repo_metadata_file` matches the output from Phase 1 mining + +5. **Testing**: Test queries in Zoekt web interface first (`http://localhost:6070`) before adding to YAML + +## Integration with Pipeline + +``` +Phase 1: Mining + ├─ Find repositories using security libraries + ├─ Save to: fastapi_passlib_mutual_dependents.jsonl + └─ Clone and index repositories + +Phase 2: Extraction + ├─ Load YAML: password_based_authentication.yaml + ├─ Read repo_metadata_file: fastapi_passlib_mutual_dependents.jsonl + ├─ Generate queries for each role × repository + ├─ Search using Zoekt API + └─ Save results with code contexts +``` + +## References + +- Van den Berghe's security patterns catalog +- FastAPI security documentation +- Passlib documentation +- Python-JOSE documentation +- Zoekt query syntax diff --git a/docs/artifact_doc.md b/docs/artifact_doc.md new file mode 100644 index 0000000..6e32643 --- /dev/null +++ b/docs/artifact_doc.md @@ -0,0 +1,385 @@ +## Project Overview + +This is a security pattern mining research project that identifies and analyzes how security libraries are integrated in real-world applications. The pipeline consists of two main phases: + +### Phase 1: Repository Mining +1. Queries Libraries.io API to find packages that depend on specific security libraries +2. Identifies repositories that use multiple security packages (mutual dependents) +3. 
Clones these repositories for analysis +4. Indexes them using Zoekt for searchable code analysis + +### Phase 2: Pattern Extraction +1. Loads security pattern metadata from YAML files (based on Van den Berghe's security patterns) +2. Constructs role-specific Zoekt queries for each security pattern +3. Searches indexed repositories using Zoekt to find pattern implementations +4. Extracts code contexts showing how each pattern role is implemented +5. Saves search results with file paths, line numbers, and code snippets + +## Key Commands + +### Phase 1: Repository Mining + +**Full pipeline (fetch + crawl):** +```bash +python security_pattern_miner/src/runner.py \ + --get_dependents \ + --package_names fastapi passlib \ + --language python \ + --package_manager Pypi \ + --root_data_dir=./build/volumes/data \ + --max_pages=10 \ + --per_page=100 +``` + +**Crawl only (using previously fetched data):** +```bash +python security_pattern_miner/src/runner.py \ + --crawl_only \ + --package_names fastapi passlib \ + --language python \ + --package_manager Pypi \ + --root_data_dir=./build/volumes/data \ + --start_index=0 \ + --end_index=10 +``` + +**Clean saved dependent files:** +```bash +python security_pattern_miner/src/runner.py \ + --clean_only \ + --package_names fastapi passlib \ + --language python \ + --package_manager Pypi +``` + +### Phase 2: Pattern Extraction + +**Construct queries only:** +```bash +python security_pattern_miner/src/runner.py \ + --construct_queries \ + --pattern password_based_authentication \ + --web_framework fastapi \ + --language python \ + --root_data_dir=./build/volumes/data +``` + +**Construct and search queries:** +```bash +python security_pattern_miner/src/runner.py \ + --construct_queries \ + --search_queries \ + --pattern password_based_authentication \ + --web_framework fastapi \ + --language python \ + --root_data_dir=./build/volumes/data \ + --zoekt_url=http://localhost:6070/api/search +``` + +### Using Docker Compose + +**Run mining 
service:** +```bash +docker compose up security_pattern_miner +``` + +**Run extraction service (with search):** +```bash +docker compose up security_pattern_extractor +``` + +**Run Zoekt services:** +```bash +# Start indexer (one-time or periodic re-indexing) +docker compose up zoekt-indexer + +# Start web server (for search interface and API) +docker compose up zoekt-webserver +``` + +**Access Zoekt web search interface:** +``` +http://localhost:6070 +``` + +## Architecture + +### Core Components + +**1. Security Pattern Miner** (`SecurityPatternMiner` class) +- Queries Libraries.io API to find packages that depend on target security libraries +- Supports Python (PyPI) and Java (Maven) +- Key operations: + - `get_dependents()` - Fetches paginated dependents from Libraries.io API + - `find_mutual_dependents()` - Finds intersection of dependent repos across multiple packages + - `save_mutual_dependents()` - Persists results to JSONL format + - `clean_saved_dependents()` - Deduplicates saved files + +**2. Repository Crawler** (`repo_crawler.base.GitCrawler`) +- Clones GitHub repositories using GitPython +- Requires GitHub authentication via environment variables +- Key operations: + - `crawl()` - Clones single repository + - `crawl_from_dependent_repos_info()` - Bulk cloning with progress tracking + - `load_dependedent_repos_info()` - Loads JSONL repo metadata + +**3. Security Pattern Extractor** (`SecurityPatternExtractor` class) +- Constructs Zoekt queries from security pattern metadata (YAML files) +- Searches indexed repositories for pattern implementations +- Extracts code contexts for each pattern role +- Key operations: + - `construct_queries()` - Builds queries from YAML pattern definitions + - `search_and_save_results()` - Executes Zoekt searches and saves results + +**4. 
Queries Loader** (`context_retriever.queries_loader.QueriesLoader`) +- Loads security pattern metadata from YAML files +- Maps pattern roles to specific API signatures and usage patterns +- Generates repository-specific queries +- Key operations: + - `load_from_pattern_metadata_file()` - Parses YAML pattern definitions + - `load_queries()` - Creates Query objects for each role/repo combination + - `save_queries_to_file()` - Persists queries to JSONL + +**5. Zoekt Search Requester** (`context_retriever.zoekt_retriever.ZoektSearchRequester`) +- Interfaces with Zoekt search API +- Processes search results and extracts code contexts +- Key operations: + - `zoekt_search_request()` - Sends search queries to Zoekt API + - `post_process_search_results()` - Extracts file paths, line numbers, snippets + - `search_queries_and_save()` - Batch search with result aggregation + - `save_search_results_to_file()` - Persists SearchedResponse objects to JSONL + +**6. Code Indexer (Zoekt)** +- Creates searchable indexes of cloned repositories +- Two services: + - `zoekt-indexer` - Builds indexes from cloned repos + - `zoekt-webserver` - Provides web search interface on port 6070 and REST API + +### Data Flow + +``` +Phase 1: Mining +SecurityPatternMiner.run() + ├─ dependent_miner.get_dependents() [Fetch from Libraries.io API] + ├─ dependent_miner.find_mutual_dependents() [Find intersection across packages] + ├─ dependent_miner.save_mutual_dependents() [Save to JSONL] + └─ repo_crawler.crawl_from_dependent_repos_info() [Clone repositories] + └─ Stored in: build/volumes/data/cloned_repos/ + └─ Indexed by Zoekt in: build/volumes/zoekt/index-data/ + +Phase 2: Extraction +SecurityPatternExtractor.run() + ├─ query_constructor.load_from_pattern_metadata_file() [Load YAML] + ├─ query_constructor.load_queries() [Generate queries] + ├─ query_constructor.save_queries_to_file() [Save queries to JSONL] + └─ zoekt_searcher.search_queries_and_save() [Search & extract contexts] + └─ Saved to: 
build/volumes/data/search_results/ +``` + +### Directory Structure + +``` +security_pattern_miner/ +├── src/ +│ ├── config/ # Configuration (API keys, constants, paths) +│ │ ├── constants.py # Language/platform constants +│ │ ├── libraries_io.py # Libraries.io API config +│ │ ├── crawler.py # Git crawler config +│ │ ├── queries_loader.py # Query construction config +│ │ └── zoekt.py # Zoekt search config +│ ├── dependent_miner/ # Libraries.io API integration +│ │ ├── base.py # Abstract base + LibrariesIODependentMiner +│ │ ├── python.py # PyPI implementation +│ │ └── java.py # Maven implementation +│ ├── repo_crawler/ # Git repository cloning +│ │ └── base.py # GitCrawler implementation +│ ├── context_retriever/ # Query construction & search +│ │ ├── queries_loader.py # YAML to queries +│ │ ├── zoekt_retriever.py # Zoekt search integration +│ │ └── queries_library/ # YAML pattern definitions +│ │ └── python/fastapi/patterns/ +│ │ └── password_based_authentication.yaml +│ ├── schemas/ # Pydantic data models +│ ├── utils/ # Helper functions (API, GitHub, logging) +│ └── runner.py # Main entry point +│ +build/volumes/data/ +├── dependent_repos_info/ # JSONL files with repo metadata +├── cloned_repos/ # Cloned GitHub repositories +├── output_queries/ # Generated Zoekt queries (JSONL) +└── search_results/ # Search results with code contexts (JSONL) + +zoekt/ # Submodule for code search/indexing +build/volumes/zoekt/ +└── index-data/ # Zoekt search indexes +``` + +## Important Implementation Details + +### Phase 1: Mining + +**Libraries.io API Integration** +- API key required: set `LIBRARIES_IO_API_KEY` in `.env` +- Paginated requests: configure `--max_pages` and `--per_page` +- Results saved to: `build/volumes/data/dependent_repos_info/{package_name}_dependents.jsonl` +- Mutual dependents saved to: `build/volumes/data/dependent_repos_info/{package1}_{package2}_mutual_dependents.jsonl` + +**Git Repository Cloning** +- GitHub authentication: set `GITHUB_TOKEN` in `.env` +- Clones stored in: 
`build/volumes/data/cloned_repos/` +- Use `--start_index` and `--end_index` to clone specific ranges +- Skips repositories that already exist locally + +**Zoekt Indexing** +- Automatically indexes all repositories in `cloned_repos/` +- Index stored in: `build/volumes/zoekt/index-data/` +- Supports both Git repositories and regular directories +- Re-run `zoekt-indexer` service to update indexes + +### Phase 2: Extraction + +**Pattern Metadata (YAML)** +- Located in: `context_retriever/queries_library/python/fastapi/patterns/` +- Defines pattern roles and their corresponding queries +- Maps roles to specific API signatures from security libraries +- Example: `password_based_authentication.yaml` + +**Query Construction** +- Reads YAML files containing pattern definitions +- For each role, generates queries with: + - Library-specific API signatures (e.g., `pwd_context.hash(`) + - Language filters (e.g., `lang:python`) + - Repository constraints (e.g., `r:github.com/owner/repo`) +- Queries saved to: `build/volumes/data/output_queries/{pattern}_{web_framework}_queries.jsonl` + +**Zoekt Search** +- Sends queries to Zoekt API at configured URL +- Extracts code contexts including: + - File path + - Start and end line numbers + - Code snippet (configurable: context lines or whole file) +- Search results saved to: `build/volumes/data/search_results/{pattern}_{web_framework}_search_results.jsonl` + +**Search Result Schema** +```json +{ + "repo": "github.com/owner/repo", + "role": "hasher", + "query": "pwd_context.hash lang:python r:github.com/owner/repo", + "webframework": "fastapi", + "pattern": "password_based_authentication", + "success": true, + "contexts": [ + { + "filepath": "owner_repo/app/security.py", + "start_line": 10, + "end_line": 15, + "snippet": "def hash_password(password: str):\n return pwd_context.hash(password)" + } + ] +} +``` + +### Supported Security Patterns + +Based on Van den Berghe's security patterns: +1. **Password-Based Authentication** - Dependencies: `fastapi`, `passlib` +2. 
**Verifiable Token-Based Authentication** - Dependencies: `fastapi`, `pyjwt` or `python-jose` +3. **Opaque Token-Based Authentication** - Dependencies: `fastapi`, `redis` or caching libraries +4. **Session-Based Access Control** - Dependencies: `fastapi`, `redis`, database ORMs +5. **Obscure Token-Based Access Control** - Dependencies: `fastapi`, database ORMs + +### Data Models + +**Query** (Pydantic model): +- `repo`: Repository full name +- `role`: Security pattern role (e.g., "hasher", "enforcer") +- `query`: Zoekt search query string +- `webframework`: Web framework name +- `pattern`: Security pattern name + +**Context** (Pydantic model): +- `filepath`: Relative path to source file +- `start_line`: Starting line number +- `end_line`: Ending line number +- `snippet`: Code snippet or full file content + +**SearchedResponse** (extends Query): +- Inherits all Query fields +- `success`: Boolean indicating if search found results +- `contexts`: List of Context objects + +## Configuration Files + +**Environment Variables** (`.env`): +- `LIBRARIES_IO_API_KEY` - Libraries.io API key +- `GITHUB_TOKEN` - GitHub personal access token +- `ZOEKT_URL` - Zoekt API endpoint (default: `http://localhost:6070/api/search`) +- `NUM_CONTEXT_LINES` - Number of context lines around matches +- `MAX_RESULTS` - Maximum results per query +- `GET_WHOLE_FILE` - Whether to retrieve entire file or just context + +**Docker Configuration** (`docker-compose.yml`): +- `security_pattern_miner` - Phase 1: Mine repositories +- `security_pattern_extractor` - Phase 2: Extract patterns (includes Zoekt search) +- `zoekt-webserver` - Zoekt search API and web interface +- `zoekt-indexer` - Indexes cloned repositories + +**Application Configuration** (`security_pattern_miner/src/config/`): +- `constants.py` - Language/platform constants +- `libraries_io.py` - Libraries.io API settings +- `crawler.py` - Git crawler settings +- `queries_loader.py` - Query construction settings +- `zoekt.py` - Zoekt search 
settings + +## Current Git State + +- **Active branch:** `indexer` (Zoekt integration work) +- Build artifacts and cloned repos stored in `build/` (git-ignored) +- Zoekt added as git submodule +- Two-phase pipeline: mining (Phase 1) and extraction (Phase 2) + +## Example Workflow + +**Complete end-to-end workflow:** + +```bash +# 1. Mine repositories that use both FastAPI and Passlib +docker compose up security_pattern_miner + +# 2. Index the cloned repositories +docker compose up zoekt-indexer + +# 3. Start Zoekt web server +docker compose up -d zoekt-webserver + +# 4. Extract password-based authentication pattern implementations +docker compose up security_pattern_extractor + +# 5. View results +cat build/volumes/data/search_results/password_based_authentication_fastapi_search_results.jsonl +``` + +**Manual workflow for testing:** + +```bash +# Phase 1: Mining +python src/runner.py \ + --get_dependents \ + --package_names fastapi passlib \ + --language python \ + --package_manager Pypi \ + --root_data_dir=./build/volumes/data + +# Index with Zoekt +docker compose up zoekt-indexer + +# Phase 2: Extraction +python src/runner.py \ + --construct_queries \ + --search_queries \ + --pattern password_based_authentication \ + --web_framework fastapi \ + --language python \ + --root_data_dir=./build/volumes/data +``` diff --git a/docs/fastapi_security_pattern_mapping.md b/docs/fastapi_security_pattern_mapping.md new file mode 100644 index 0000000..be17672 --- /dev/null +++ b/docs/fastapi_security_pattern_mapping.md @@ -0,0 +1,1559 @@ +# FastAPI Security Features and Van den Berghe's Security Patterns + +## Overview + +This document maps FastAPI's built-in security features to the authentication patterns from Van den Berghe's security pattern catalogue, showing how to implement each pattern and which additional libraries are needed for complete implementations. + +--- + +## 1. 
Password-Based Authentication + +### Pattern Summary +Subjects authenticate by providing an identifier (username/email) and password. The system verifies the password hash against stored credentials. + +### FastAPI Implementation + +**Built-in FastAPI Components:** +- `OAuth2PasswordRequestForm` / `OAuth2PasswordRequestFormStrict` - Collects username and password from form data +- `OAuth2PasswordBearer` - Acts as the Enforcer, requiring authentication for endpoints +- `HTTPBasic` / `HTTPBasicCredentials` - Alternative for HTTP Basic authentication + +**Pattern Role Mapping:** +- **Subject**: The API client/user +- **Enforcer**: `OAuth2PasswordBearer` dependency +- **Verification Manager + Comparator + Hasher**: Custom implementation (external library needed) +- **Password Store**: Database with external ORM (SQLAlchemy, Tortoise-ORM, etc.) +- **Registrar**: Custom endpoint with password hashing + +### Required External Libraries + +```python +# Password hashing (Hasher role) +from passlib.context import CryptContext + +# Or use bcrypt directly +import bcrypt + +# Database (Password Store role) +from sqlalchemy.orm import Session +from sqlalchemy import create_engine + +# Or with async support +from tortoise import fields, models +from tortoise.contrib.fastapi import register_tortoise +``` + +### Complete Implementation Example + +```python +from fastapi import FastAPI, Depends, HTTPException, status +from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm +from passlib.context import CryptContext +from pydantic import BaseModel +from datetime import datetime, timedelta +from jose import JWTError, jwt +from typing import Optional + +# Password hashing (Hasher + Comparator) +pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto") + +# OAuth2 scheme (Enforcer) +oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token") + +app = FastAPI() + +# Mock database (Password Store) +fake_users_db = { + "john": { + "username": "john", + 
"hashed_password": pwd_context.hash("secret"), + "email": "john@example.com" + } +} + +class User(BaseModel): + username: str + email: Optional[str] = None + +# Verification Manager functions +def verify_password(plain_password: str, hashed_password: str) -> bool: + """Comparator role""" + return pwd_context.verify(plain_password, hashed_password) + +def get_password_hash(password: str) -> str: + """Hasher role""" + return pwd_context.hash(password) + +def authenticate_user(username: str, password: str): + """Verification Manager role""" + user = fake_users_db.get(username) + if not user: + return False + if not verify_password(password, user["hashed_password"]): + return False + return user + +# Login endpoint (Registrar for tokens) +@app.post("/token") +async def login(form_data: OAuth2PasswordRequestForm = Depends()): + """Subject registration - issues tokens after password verification""" + user = authenticate_user(form_data.username, form_data.password) + if not user: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Incorrect username or password", + headers={"WWW-Authenticate": "Bearer"}, + ) + # Return token (typically JWT for stateless auth) + access_token = create_access_token(data={"sub": user["username"]}) + return {"access_token": access_token, "token_type": "bearer"} + +# Protected endpoint (demonstrates Enforcer) +@app.get("/users/me") +async def read_users_me(token: str = Depends(oauth2_scheme)): + """Enforcer ensures authentication before access""" + # Token verification would happen here + return {"username": "john"} + +# Registration endpoint (Registrar for new users) +@app.post("/register") +async def register_user(username: str, password: str, email: str): + """Registrar role - adds new credentials to Password Store""" + if username in fake_users_db: + raise HTTPException(status_code=400, detail="Username already exists") + + # Hash password before storage + hashed_password = get_password_hash(password) + + # Store in 
database (Password Store) + fake_users_db[username] = { + "username": username, + "hashed_password": hashed_password, + "email": email + } + return {"message": "User created successfully"} +``` + +### Pattern Considerations Met + +✅ **Password hashing**: Uses Passlib/bcrypt for secure hashing +✅ **Salt**: Automatically handled by bcrypt +✅ **Pepper**: Can be added via encryption layer (see below) +✅ **Password policy**: Implement with custom validators +✅ **Error messages**: Generic messages prevent user enumeration + +### Adding Pepper Support + +```python +from cryptography.fernet import Fernet + +class PasswordManager: + def __init__(self, pepper_key: bytes): + """Pepper Store and Encrypter roles""" + self.cipher = Fernet(pepper_key) + self.pwd_context = CryptContext(schemes=["bcrypt"]) + + def hash_and_encrypt(self, password: str) -> bytes: + """Hash with salt, then encrypt with pepper""" + hashed = self.pwd_context.hash(password) + encrypted = self.cipher.encrypt(hashed.encode()) + return encrypted + + def decrypt_and_verify(self, password: str, stored_pwd: bytes) -> bool: + """Decrypt, then verify password""" + try: + decrypted = self.cipher.decrypt(stored_pwd).decode() + return self.pwd_context.verify(password, decrypted) + except: + return False +``` + +--- + +## 2. Verifiable Token-Based Authentication (JWT) + +### Pattern Summary +Subjects authenticate using self-contained tokens (JWTs) that include identity information and are cryptographically signed. The system verifies token integrity without storing them. 
+ +### FastAPI Implementation + +**Built-in FastAPI Components:** +- `HTTPBearer` - Enforcer for Bearer token authentication +- `OAuth2PasswordBearer` - Can also be used as Enforcer + +**Pattern Role Mapping:** +- **Subject**: API client with JWT +- **Enforcer**: `HTTPBearer` or `OAuth2PasswordBearer` dependency +- **Verifier + Cryptographer**: JWT library (python-jose, PyJWT) +- **Key Manager**: Custom implementation or secrets management +- **Registrar**: Login endpoint that issues JWTs +- **Token Generator**: JWT encoding function + +### Required External Libraries + +```python +# JWT handling (Verifier, Cryptographer, Token Generator) +from jose import JWTError, jwt +# Alternative: import jwt as PyJWT + +# For key management +from cryptography.hazmat.primitives import serialization +from cryptography.hazmat.primitives.asymmetric import rsa +``` + +### Complete Implementation Example + +```python +from fastapi import FastAPI, Depends, HTTPException, status +from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials +from jose import JWTError, jwt +from datetime import datetime, timedelta +from pydantic import BaseModel +from typing import Optional + +app = FastAPI() + +# Key Manager configuration +SECRET_KEY = "your-secret-key-keep-it-secret" # For HMAC (MAC-based) +ALGORITHM = "HS256" +ACCESS_TOKEN_EXPIRE_MINUTES = 30 + +# For digital signatures (asymmetric), use RS256: +# ALGORITHM = "RS256" +# PRIVATE_KEY = load_private_key() +# PUBLIC_KEY = load_public_key() + +security = HTTPBearer() # Enforcer + +class TokenData(BaseModel): + username: Optional[str] = None + +class User(BaseModel): + username: str + email: Optional[str] = None + disabled: Optional[bool] = False + +# Token Generator role +def create_access_token(data: dict, expires_delta: Optional[timedelta] = None): + """ + Generates verifiable token with: + - Principal (username) + - Expiration date + - Signature for integrity + """ + to_encode = data.copy() + if expires_delta: + expire = 
datetime.utcnow() + expires_delta + else: + expire = datetime.utcnow() + timedelta(minutes=15) + + to_encode.update({"exp": expire, "iat": datetime.utcnow()}) + + # Cryptographer role - sign the token + encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM) + return encoded_jwt + +# Verifier role +def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)): + """ + Verifies: + 1. Token signature (integrity) + 2. Token expiration + 3. Extracts principal + """ + token = credentials.credentials + + credentials_exception = HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Could not validate credentials", + headers={"WWW-Authenticate": "Bearer"}, + ) + + try: + # Cryptographer role - verify signature and decode + payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM]) + username: str = payload.get("sub") + + if username is None: + raise credentials_exception + + token_data = TokenData(username=username) + + except JWTError: + raise credentials_exception + + return token_data + +# Login endpoint (Registrar) +@app.post("/token") +async def login_for_token(username: str, password: str): + """Issues new JWT token after authentication""" + # Authenticate user (typically with password-based auth) + user = authenticate_user(username, password) # From previous pattern + + if not user: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Incorrect username or password" + ) + + # Generate token + access_token_expires = timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES) + access_token = create_access_token( + data={"sub": user["username"]}, + expires_delta=access_token_expires + ) + + return {"access_token": access_token, "token_type": "bearer"} + +# Protected endpoint with Enforcer +@app.get("/users/me") +async def read_current_user(token_data: TokenData = Depends(verify_token)): + """Enforcer ensures valid JWT before access""" + # Principal is now verified and available + return {"username": 
token_data.username} + +# Example with token refresh +@app.post("/token/refresh") +async def refresh_token(token_data: TokenData = Depends(verify_token)): + """Issues new token for authenticated user""" + new_token = create_access_token(data={"sub": token_data.username}) + return {"access_token": new_token, "token_type": "bearer"} +``` + +### Using Digital Signatures (RS256) Instead of MAC (HS256) + +```python +from cryptography.hazmat.primitives import serialization +from cryptography.hazmat.primitives.asymmetric import rsa +from cryptography.hazmat.backends import default_backend + +# Key Manager - generate keys +private_key = rsa.generate_private_key( + public_exponent=65537, + key_size=2048, + backend=default_backend() +) +public_key = private_key.public_key() + +# Serialize for storage +private_pem = private_key.private_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PrivateFormat.PKCS8, + encryption_algorithm=serialization.NoEncryption() +) + +public_pem = public_key.public_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PublicFormat.SubjectPublicKeyInfo +) + +# Use RS256 algorithm +ALGORITHM = "RS256" + +def create_access_token_rs256(data: dict): + """Token Generator with digital signature""" + to_encode = data.copy() + expire = datetime.utcnow() + timedelta(minutes=30) + to_encode.update({"exp": expire}) + + # Sign with private key + encoded_jwt = jwt.encode(to_encode, private_pem, algorithm=ALGORITHM) + return encoded_jwt + +def verify_token_rs256(token: str): + """Verifier with digital signature verification""" + try: + # Verify with public key + payload = jwt.decode(token, public_pem, algorithms=[ALGORITHM]) + return payload + except JWTError: + raise HTTPException(status_code=401, detail="Invalid token") +``` + +### Pattern Considerations Met + +✅ **Token integrity**: JWT signature ensures no tampering +✅ **Token lifetime**: `exp` claim enforces expiration +✅ **Self-contained**: Principal in token (no 
server-side storage) +✅ **Stateless**: No session storage needed +⚠️ **Token revocation**: Requires additional implementation (see below) + +### Adding Token Revocation Support + +```python +from typing import Set +from datetime import datetime + +# Token Manager for revoked tokens +class TokenBlacklist: + """Tracks revoked but not yet expired tokens""" + def __init__(self): + self.revoked_tokens: Set[str] = set() + + def revoke(self, token: str): + self.revoked_tokens.add(token) + + def is_revoked(self, token: str) -> bool: + return token in self.revoked_tokens + + def cleanup_expired(self, current_time: datetime): + """Remove expired tokens from blacklist""" + # Implementation would decode tokens and check exp + pass + +blacklist = TokenBlacklist() + +def verify_token_with_revocation(credentials: HTTPAuthorizationCredentials = Depends(security)): + """Enhanced verifier checking revocation list""" + token = credentials.credentials + + # Check blacklist first + if blacklist.is_revoked(token): + raise HTTPException(status_code=401, detail="Token has been revoked") + + # Then verify normally + return verify_token(credentials) + +@app.post("/logout") +async def logout(credentials: HTTPAuthorizationCredentials = Depends(security)): + """Revoke current token""" + token = credentials.credentials + blacklist.revoke(token) + return {"message": "Successfully logged out"} +``` + +--- + +## 3. Opaque Token-Based Authentication + +### Pattern Summary +Subjects authenticate using opaque tokens (session IDs) that the system generates and tracks. The token itself contains no information; the system maps tokens to principals. 
+ +### FastAPI Implementation + +**Built-in FastAPI Components:** +- `APIKeyCookie` - For cookie-based session tokens (Enforcer) +- `APIKeyHeader` - For header-based session tokens (Enforcer) +- `OAuth2PasswordBearer` - Can also act as Enforcer + +**Pattern Role Mapping:** +- **Subject**: API client with session token +- **Enforcer**: `APIKeyCookie`, `APIKeyHeader`, or custom dependency +- **Verifier**: Custom session verification logic +- **Principal Provider**: Session storage (Redis, database) +- **Token Generator**: Secure random generator (`secrets` module) +- **Registrar**: Login endpoint that creates sessions + +### Required External Libraries + +```python +# Session storage (Principal Provider) +import redis.asyncio as redis +from redis import Redis + +# Or in-memory with TTL +from cachetools import TTLCache + +# Secure token generation +import secrets +import hashlib +``` + +### Complete Implementation Example + +```python +from fastapi import FastAPI, Depends, HTTPException, status, Response, Cookie +from fastapi.security import APIKeyCookie +from typing import Optional, Dict +import secrets +import hashlib +from datetime import datetime, timedelta +import redis.asyncio as redis + +app = FastAPI() + +# Principal Provider - session storage +class SessionManager: + """ + Principal Provider role + Manages mapping of tokens to principals + """ + def __init__(self, redis_client: redis.Redis): + self.redis = redis_client + self.session_timeout = 1800 # 30 minutes + self.absolute_timeout = 43200 # 12 hours + + async def create_session(self, principal: str) -> str: + """ + Token Generator + Registrar roles + Generates secure opaque token + """ + # Generate cryptographically secure token (32 bytes = 256 bits) + token = secrets.token_urlsafe(32) # Base64 URL-safe encoding + + session_data = { + "principal": principal, + "created_at": datetime.utcnow().isoformat(), + "last_activity": datetime.utcnow().isoformat() + } + + # Store in Redis with expiration + await
self.redis.setex( + f"session:{token}", + self.absolute_timeout, + str(session_data) + ) + + return token + + async def get_principal(self, token: str) -> Optional[str]: + """ + Verifier role + Retrieves principal for valid token + """ + session_data = await self.redis.get(f"session:{token}") + + if not session_data: + return None + + # Parse session data + import ast + data = ast.literal_eval(session_data.decode()) + + # Check activity timeout + last_activity = datetime.fromisoformat(data["last_activity"]) + if (datetime.utcnow() - last_activity).seconds > self.session_timeout: + await self.invalidate_session(token) + return None + + # Update last activity + data["last_activity"] = datetime.utcnow().isoformat() + await self.redis.setex( + f"session:{token}", + self.absolute_timeout, + str(data) + ) + + return data["principal"] + + async def invalidate_session(self, token: str): + """Explicitly invalidate a session (logout)""" + await self.redis.delete(f"session:{token}") + + async def invalidate_all_sessions(self, principal: str): + """Invalidate all sessions for a principal""" + # Scan for all sessions belonging to principal + async for key in self.redis.scan_iter(match="session:*"): + session_data = await self.redis.get(key) + if session_data: + import ast + data = ast.literal_eval(session_data.decode()) + if data.get("principal") == principal: + await self.redis.delete(key) + +# Initialize Redis connection +redis_client = None + +@app.on_event("startup") +async def startup(): + global redis_client + redis_client = await redis.from_url("redis://localhost") + +@app.on_event("shutdown") +async def shutdown(): + await redis_client.close() + +# Create session manager +def get_session_manager() -> SessionManager: + return SessionManager(redis_client) + +# Cookie-based session (Enforcer) +cookie_scheme = APIKeyCookie(name="session_id") + +# Verifier dependency +async def get_current_user( + session_id: str = Depends(cookie_scheme), + session_manager: SessionManager = 
Depends(get_session_manager) +): + """ + Enforcer + Verifier roles + Validates session and extracts principal + """ + principal = await session_manager.get_principal(session_id) + + if principal is None: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid or expired session" + ) + + return principal + +# Login endpoint (Registrar) +@app.post("/login") +async def login( + username: str, + password: str, + response: Response, + session_manager: SessionManager = Depends(get_session_manager) +): + """ + Registrar role + Creates new session after password authentication + """ + # Authenticate user (using password-based pattern) + user = authenticate_user(username, password) + + if not user: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid credentials" + ) + + # Create session + session_id = await session_manager.create_session(username) + + # Set cookie with security attributes + response.set_cookie( + key="session_id", + value=session_id, + httponly=True, # Prevent JavaScript access + secure=True, # HTTPS only + samesite="lax", # CSRF protection + max_age=43200 # 12 hours + ) + + return {"message": "Login successful"} + +# Protected endpoint +@app.get("/users/me") +async def get_current_user_profile(principal: str = Depends(get_current_user)): + """Enforcer ensures valid session""" + return {"username": principal} + +# Logout endpoint +@app.post("/logout") +async def logout( + response: Response, + session_id: str = Depends(cookie_scheme), + session_manager: SessionManager = Depends(get_session_manager) +): + """Invalidate session""" + await session_manager.invalidate_session(session_id) + response.delete_cookie("session_id") + return {"message": "Logged out successfully"} + +# Force logout from all devices +@app.post("/logout/all") +async def logout_all_sessions( + principal: str = Depends(get_current_user), + session_manager: SessionManager = Depends(get_session_manager) +): + """Invalidate all 
sessions for current user""" + await session_manager.invalidate_all_sessions(principal) + return {"message": "Logged out from all devices"} +``` + +### Alternative: In-Memory Session Storage + +```python +from cachetools import TTLCache +import threading + +class InMemorySessionManager: + """ + Principal Provider using in-memory storage + Suitable for single-server deployments + """ + def __init__(self): + self.sessions = TTLCache(maxsize=10000, ttl=1800) + self.lock = threading.Lock() + + def create_session(self, principal: str) -> str: + token = secrets.token_urlsafe(32) + with self.lock: + self.sessions[token] = { + "principal": principal, + "created_at": datetime.utcnow() + } + return token + + def get_principal(self, token: str) -> Optional[str]: + with self.lock: + session = self.sessions.get(token) + return session["principal"] if session else None + + def invalidate_session(self, token: str): + with self.lock: + self.sessions.pop(token, None) +``` + +### Pattern Considerations Met + +✅ **Unpredictable tokens**: Uses `secrets.token_urlsafe()` (CSPRNG) +✅ **Entropy**: 32 bytes = 256 bits (exceeds 64-bit minimum) +✅ **Activity timeout**: Updates last activity time +⚠️ **Absolute timeout**: The 12-hour Redis TTL is renewed on every request, so it is not a hard cap; compare against `created_at` to enforce a maximum session duration +✅ **Token lifetime management**: Redis TTL + activity checks +✅ **Session fixation prevention**: New token on re-authentication +✅ **Secure storage**: HttpOnly, Secure, SameSite cookies + +--- + +## 4. Session-Based Access Control + +### Pattern Summary +Combines opaque token authentication with authorization. Session ID authenticates the user, and their privileges are checked for each action.
+ +### FastAPI Implementation + +**Built-in FastAPI Components:** +- `APIKeyCookie` - Session token enforcement +- `SecurityScopes` - For scope-based authorization +- Custom dependencies for authorization + +**Pattern Role Mapping:** +- **Subject**: API client with session +- **Authentication Enforcer**: `APIKeyCookie` + session verification +- **Verifier**: Session validation logic +- **Session Manager**: Redis/database session storage +- **Authorization Enforcer**: Custom authorization dependency +- **Decider**: Permission checking logic +- **Policy Provider**: Database with user permissions + +### Required External Libraries + +```python +import redis.asyncio as redis +from sqlalchemy.orm import Session +from enum import Enum +from typing import List, Set +``` + +### Complete Implementation Example + +```python +from fastapi import FastAPI, Depends, HTTPException, status +from fastapi.security import APIKeyCookie +from typing import Optional, Set, List +from enum import Enum +import redis.asyncio as redis +from pydantic import BaseModel + +app = FastAPI() + +# Define permissions +class Permission(str, Enum): + READ_USERS = "users:read" + WRITE_USERS = "users:write" + DELETE_USERS = "users:delete" + READ_POSTS = "posts:read" + WRITE_POSTS = "posts:write" + +class Role(str, Enum): + ADMIN = "admin" + USER = "user" + GUEST = "guest" + +# Session data model +class SessionData(BaseModel): + principal: str + role: Role + permissions: Set[Permission] + +# Policy Provider - stores role-permission mappings +class PolicyProvider: + """ + Manages authorization policies + Maps principals to roles and permissions + """ + def __init__(self): + self.role_permissions = { + Role.ADMIN: { + Permission.READ_USERS, + Permission.WRITE_USERS, + Permission.DELETE_USERS, + Permission.READ_POSTS, + Permission.WRITE_POSTS, + }, + Role.USER: { + Permission.READ_USERS, + Permission.READ_POSTS, + Permission.WRITE_POSTS, + }, + Role.GUEST: { + Permission.READ_POSTS, + } + } + + # Map users 
to roles (would typically be in database) + self.user_roles = { + "alice": Role.ADMIN, + "bob": Role.USER, + "charlie": Role.GUEST + } + + def get_user_role(self, principal: str) -> Role: + """Get role for a principal""" + return self.user_roles.get(principal, Role.GUEST) + + def get_role_permissions(self, role: Role) -> Set[Permission]: + """Get privileges for a role""" + return self.role_permissions.get(role, set()) + + def get_user_permissions(self, principal: str) -> Set[Permission]: + """Get all privileges for a principal""" + role = self.get_user_role(principal) + return self.get_role_permissions(role) + +# Enhanced Session Manager with authorization data +class SessionManager: + """ + Session Manager role + Stores session with principal and permissions + """ + def __init__(self, redis_client: redis.Redis, policy_provider: PolicyProvider): + self.redis = redis_client + self.policy_provider = policy_provider + self.session_timeout = 1800 + + async def create_session(self, principal: str) -> str: + """Create session with authorization data""" + token = secrets.token_urlsafe(32) + + # Get user permissions from Policy Provider + role = self.policy_provider.get_user_role(principal) + permissions = self.policy_provider.get_user_permissions(principal) + + session_data = SessionData( + principal=principal, + role=role, + permissions=permissions + ) + + await self.redis.setex( + f"session:{token}", + self.session_timeout, + session_data.json() + ) + + return token + + async def get_session(self, token: str) -> Optional[SessionData]: + """Retrieve session with authorization data""" + data = await self.redis.get(f"session:{token}") + + if not data: + return None + + return SessionData.parse_raw(data) + + async def invalidate_session(self, token: str): + await self.redis.delete(f"session:{token}") + +# Authentication Enforcer +cookie_scheme = APIKeyCookie(name="session_id") + +# Combined Verifier + Decider +class AuthorizationChecker: + """ + Combines authentication 
verification and authorization decision + """ + def __init__(self, required_permissions: Optional[List[Permission]] = None): + self.required_permissions = required_permissions or [] + + async def __call__( + self, + session_id: str = Depends(cookie_scheme), + session_manager: SessionManager = Depends(get_session_manager) + ) -> SessionData: + """ + 1. Authentication: Verify session (Verifier role) + 2. Authorization: Check permissions (Decider role) + """ + # Authentication verification + session = await session_manager.get_session(session_id) + + if session is None: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid or expired session" + ) + + # Authorization decision + if self.required_permissions: + missing_perms = set(self.required_permissions) - session.permissions + if missing_perms: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail=f"Missing required permissions: {missing_perms}" + ) + + return session + +# Helper to create permission checkers +def require_permissions(permissions: List[Permission]): + """Factory for creating authorization dependencies""" + return AuthorizationChecker(required_permissions=permissions) + +# Registrar - login with session creation +@app.post("/login") +async def login( + username: str, + password: str, + response: Response, + session_manager: SessionManager = Depends(get_session_manager) +): + """Create authenticated session with authorization data""" + # Authenticate user + user = authenticate_user(username, password) + + if not user: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid credentials" + ) + + # Create session (includes authorization data) + session_id = await session_manager.create_session(username) + + response.set_cookie( + key="session_id", + value=session_id, + httponly=True, + secure=True, + samesite="lax", + max_age=1800 + ) + + return {"message": "Login successful"} + +# Public endpoint - no authentication required 
+@app.get("/posts/public") +async def list_public_posts(): + """No enforcer - public access""" + return {"posts": ["post1", "post2"]} + +# Authenticated endpoint - requires valid session +@app.get("/users/me") +async def get_current_user( + session: SessionData = Depends(AuthorizationChecker()) +): + """Authentication enforcer only - any valid session""" + return { + "username": session.principal, + "role": session.role, + "permissions": list(session.permissions) + } + +# Authorized endpoint - requires specific permission +@app.get("/users") +async def list_users( + session: SessionData = Depends(require_permissions([Permission.READ_USERS])) +): + """ + Both authentication and authorization enforcement + Requires READ_USERS permission + """ + return {"users": ["alice", "bob", "charlie"]} + +# Multiple permissions required +@app.post("/users") +async def create_user( + username: str, + session: SessionData = Depends( + require_permissions([Permission.WRITE_USERS]) + ) +): + """Requires WRITE_USERS permission""" + return {"message": f"User {username} created by {session.principal}"} + +@app.delete("/users/{username}") +async def delete_user( + username: str, + session: SessionData = Depends( + require_permissions([Permission.DELETE_USERS]) + ) +): + """Requires DELETE_USERS permission - admin only""" + return {"message": f"User {username} deleted by {session.principal}"} + +# Resource-based authorization +@app.get("/posts/{post_id}") +async def get_post( + post_id: int, + session: SessionData = Depends( + require_permissions([Permission.READ_POSTS]) + ) +): + """Resource access with permission check""" + # Could add additional checks here: + # - Is user the post owner? + # - Is post public/private? 
+ return {"post_id": post_id, "accessed_by": session.principal} + +@app.put("/posts/{post_id}") +async def update_post( + post_id: int, + content: str, + session: SessionData = Depends( + require_permissions([Permission.WRITE_POSTS]) + ) +): + """Update requires WRITE_POSTS permission""" + # Additional check: is user the post owner? + return {"post_id": post_id, "updated_by": session.principal} + +# Logout +@app.post("/logout") +async def logout( + response: Response, + session_id: str = Depends(cookie_scheme), + session_manager: SessionManager = Depends(get_session_manager) +): + """Invalidate session""" + await session_manager.invalidate_session(session_id) + response.delete_cookie("session_id") + return {"message": "Logged out"} +``` + +### Advanced: Attribute-Based Access Control (ABAC) + +```python +from typing import Callable, Dict, Any + +class ABACDecider: + """ + Advanced authorization decider using attributes + """ + def __init__( + self, + policy_function: Callable[[SessionData, Dict[str, Any]], bool] + ): + self.policy_function = policy_function + + async def __call__( + self, + session: SessionData = Depends(AuthorizationChecker()), + **context + ): + """ + Evaluate policy based on: + - Subject attributes (from session) + - Resource attributes (from context) + - Action attributes (from context) + - Environment attributes (time, IP, etc.) 
+ """ + if not self.policy_function(session, context): + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="Access denied by policy" + ) + return session + +# Example ABAC policy +def can_edit_post_policy(session: SessionData, context: Dict[str, Any]) -> bool: + """ + Allow if: + - User is admin, OR + - User is post owner AND post is not locked + """ + post_owner = context.get("post_owner") + post_locked = context.get("post_locked", False) + + if session.role == Role.ADMIN: + return True + + if session.principal == post_owner and not post_locked: + return True + + return False + +# Use ABAC +@app.put("/posts/{post_id}/abac") +async def update_post_abac( + post_id: int, + session: SessionData = Depends(ABACDecider(can_edit_post_policy)) +): + """ABAC-based authorization""" + # Get post details and pass to policy + # This would typically come from database + context = { + "post_owner": "alice", + "post_locked": False + } + return {"message": "Post updated"} +``` + +### Pattern Considerations Met + +✅ **Authentication first**: Session verified before authorization +✅ **Session-based**: Opaque token for authentication +✅ **Authorization per action**: Permissions checked per endpoint +✅ **Role-based permissions**: Policy provider maps roles to permissions +✅ **Resource protection**: Both authentication and authorization enforced +✅ **Session management**: Proper session lifecycle + +--- + +## 5. Obscure Token-Based Access Control (API Keys) + +### Pattern Summary +Long-lived, secret tokens that combine authentication and authorization. Common for API keys and Personal Access Tokens (PATs). 
+ +### FastAPI Implementation + +**Built-in FastAPI Components:** +- `APIKeyHeader` - For API keys in headers (Enforcer) +- `APIKeyQuery` - For API keys in query parameters (Enforcer) +- Custom security schemes + +**Pattern Role Mapping:** +- **Subject**: API client with API key +- **Enforcer**: `APIKeyHeader` or `APIKeyQuery` +- **Validator**: Combined authentication + authorization logic +- **Hasher**: Hash function for token storage +- **Token Manager**: Database storing token hashes and permissions +- **Registrar**: Endpoint to generate new API keys +- **Token Generator**: Secure random generator + +### Required External Libraries + +```python +import secrets +import hashlib +from sqlalchemy import create_engine, Column, String, JSON, DateTime +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import sessionmaker +``` + +### Complete Implementation Example + +```python +from fastapi import FastAPI, Depends, HTTPException, status, Security +from fastapi.security import APIKeyHeader +from typing import Optional, Set, List +from pydantic import BaseModel +from datetime import datetime, timedelta +import secrets +import hashlib +from sqlalchemy import Column, String, JSON, DateTime, create_engine +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import Session, sessionmaker + +app = FastAPI() + +# Database setup for Token Manager +Base = declarative_base() + +class APIKey(Base): + """Token Manager storage""" + __tablename__ = "api_keys" + + id = Column(String, primary_key=True) + token_hash = Column(String, unique=True, nullable=False) + principal = Column(String, nullable=False) + name = Column(String) # User-friendly name + permissions = Column(JSON) # List of permissions + created_at = Column(DateTime, default=datetime.utcnow) + expires_at = Column(DateTime, nullable=True) + last_used = Column(DateTime, nullable=True) + +engine = create_engine("sqlite:///./api_keys.db") +Base.metadata.create_all(engine) 
+SessionLocal = sessionmaker(bind=engine) + +def get_db(): + db = SessionLocal() + try: + yield db + finally: + db.close() + +# Models +class TokenInfo(BaseModel): + principal: str + permissions: Set[str] + name: Optional[str] = None + +# Token Manager implementation +class APIKeyManager: + """ + Token Manager role + Manages API keys with hashing + """ + def __init__(self, db: Session): + self.db = db + + @staticmethod + def generate_token() -> str: + """ + Token Generator role + Generates cryptographically secure token + At least 128 bits (16 bytes) for security + """ + return secrets.token_urlsafe(32) # 256 bits + + @staticmethod + def hash_token(token: str) -> str: + """ + Hasher role + One-way hash for secure storage + """ + return hashlib.sha256(token.encode()).hexdigest() + + def create_api_key( + self, + principal: str, + permissions: List[str], + name: Optional[str] = None, + expires_delta: Optional[timedelta] = None + ) -> str: + """ + Registrar role + Creates new API key with permissions + """ + # Generate token (shown to user only once) + token = self.generate_token() + + # Hash for storage (evidence) + token_hash = self.hash_token(token) + + # Calculate expiration + expires_at = None + if expires_delta: + expires_at = datetime.utcnow() + expires_delta + + # Store in database + api_key = APIKey( + id=secrets.token_urlsafe(16), + token_hash=token_hash, + principal=principal, + name=name, + permissions=permissions, + expires_at=expires_at + ) + + self.db.add(api_key) + self.db.commit() + + # Return plaintext token (shown only once) + return token + + def validate_token(self, token: str) -> Optional[TokenInfo]: + """ + Validator role + Verifies token and returns principal + permissions + """ + # Hash the provided token + token_hash = self.hash_token(token) + + # Look up in database + api_key = self.db.query(APIKey).filter( + APIKey.token_hash == token_hash + ).first() + + if not api_key: + return None + + # Check expiration + if api_key.expires_at and 
datetime.utcnow() > api_key.expires_at: + return None + + # Update last used timestamp + api_key.last_used = datetime.utcnow() + self.db.commit() + + return TokenInfo( + principal=api_key.principal, + permissions=set(api_key.permissions), + name=api_key.name + ) + + def revoke_token(self, token: str) -> bool: + """Revoke an API key""" + token_hash = self.hash_token(token) + api_key = self.db.query(APIKey).filter( + APIKey.token_hash == token_hash + ).first() + + if api_key: + self.db.delete(api_key) + self.db.commit() + return True + return False + + def list_user_keys(self, principal: str) -> List[APIKey]: + """List all API keys for a user""" + return self.db.query(APIKey).filter( + APIKey.principal == principal + ).all() + +# Security scheme - Enforcer +api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False) + +# Validator dependency +class APIKeyValidator: + """ + Enforcer + Validator roles + Validates API key and checks permissions + """ + def __init__(self, required_permissions: Optional[List[str]] = None): + self.required_permissions = required_permissions or [] + + async def __call__( + self, + api_key: Optional[str] = Security(api_key_header), + db: Session = Depends(get_db) + ) -> TokenInfo: + """ + 1. Validate API key (authentication) + 2. 
Check permissions (authorization) + """ + if not api_key: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Missing API key" + ) + + # Validate token + manager = APIKeyManager(db) + token_info = manager.validate_token(api_key) + + if not token_info: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid or expired API key" + ) + + # Check authorization + if self.required_permissions: + missing = set(self.required_permissions) - token_info.permissions + if missing: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail=f"Missing permissions: {missing}" + ) + + return token_info + +# Helper function +def require_api_permissions(permissions: List[str]): + """Factory for creating API key validators with permissions""" + return APIKeyValidator(required_permissions=permissions) + +# API endpoints + +# Generate new API key (requires user authentication) +@app.post("/api-keys") +async def create_api_key( + name: str, + permissions: List[str], + expires_days: Optional[int] = None, + principal: str = Depends(get_authenticated_user), # From password auth + db: Session = Depends(get_db) +): + """ + Registrar endpoint + Creates new API key (shown only once) + """ + manager = APIKeyManager(db) + + expires_delta = None + if expires_days: + expires_delta = timedelta(days=expires_days) + + token = manager.create_api_key( + principal=principal, + permissions=permissions, + name=name, + expires_delta=expires_delta + ) + + return { + "api_key": token, + "message": "Save this key securely. 
It won't be shown again.", + "permissions": permissions, + "expires_in_days": expires_days + } + +# List user's API keys +@app.get("/api-keys") +async def list_api_keys( + principal: str = Depends(get_authenticated_user), + db: Session = Depends(get_db) +): + """List all API keys for authenticated user""" + manager = APIKeyManager(db) + keys = manager.list_user_keys(principal) + + return { + "api_keys": [ + { + "id": key.id, + "name": key.name, + "permissions": key.permissions, + "created_at": key.created_at, + "expires_at": key.expires_at, + "last_used": key.last_used + } + for key in keys + ] + } + +# Revoke API key +@app.delete("/api-keys/{key_id}") +async def revoke_api_key( + key_id: str, + principal: str = Depends(get_authenticated_user), + db: Session = Depends(get_db) +): + """Revoke an API key""" + # Verify ownership + api_key = db.query(APIKey).filter( + APIKey.id == key_id, + APIKey.principal == principal + ).first() + + if not api_key: + raise HTTPException(status_code=404, detail="API key not found") + + db.delete(api_key) + db.commit() + + return {"message": "API key revoked"} + +# Public endpoint - no API key required +@app.get("/public") +async def public_endpoint(): + """No enforcer - public access""" + return {"message": "This is public"} + +# Protected endpoint - requires valid API key +@app.get("/protected") +async def protected_endpoint( + token_info: TokenInfo = Depends(APIKeyValidator()) +): + """Requires any valid API key""" + return { + "message": "Access granted", + "principal": token_info.principal, + "permissions": list(token_info.permissions) + } + +# Endpoint requiring specific permissions +@app.get("/data/read") +async def read_data( + token_info: TokenInfo = Depends( + require_api_permissions(["data:read"]) + ) +): + """Requires 'data:read' permission""" + return {"data": "sensitive information", "accessed_by": token_info.principal} + +@app.post("/data/write") +async def write_data( + content: str, + token_info: TokenInfo = Depends( 
+ require_api_permissions(["data:write"]) + ) +): + """Requires 'data:write' permission""" + return {"message": "Data written", "by": token_info.principal} + +@app.delete("/data/delete") +async def delete_data( + token_info: TokenInfo = Depends( + require_api_permissions(["data:delete"]) + ) +): + """Requires 'data:delete' permission - highly privileged""" + return {"message": "Data deleted", "by": token_info.principal} + +# Sensitive action - API key should NOT be allowed +@app.post("/account/delete") +async def delete_account( + principal: str = Depends(get_authenticated_user) # Requires password auth +): + """ + Sensitive action - requires full authentication + NOT accessible with API key + """ + return {"message": f"Account {principal} deleted"} +``` + +### Pattern Considerations Met + +✅ **Unpredictable tokens**: Uses `secrets` module (CSPRNG) +✅ **Sufficient entropy**: 256 bits (exceeds 64-bit requirement) +✅ **Hash storage**: Stores SHA-256 hash, not plaintext +✅ **Long-lived**: Optional expiration dates +✅ **Combined auth + authz**: Single token validates and authorizes +✅ **Revocable**: Users can revoke keys +✅ **Limited privileges**: Scoped permissions per key +✅ **Sensitive actions restricted**: Password auth required for critical operations + +--- + +## Summary Matrix + +| Pattern | FastAPI Built-in | Required Libraries | Implementation Complexity | +|---------|-----------------|-------------------|-------------------------| +| **Password-Based** | `OAuth2PasswordRequestForm`, `OAuth2PasswordBearer` | `passlib`, `bcrypt`, database ORM | Medium | +| **Verifiable Token (JWT)** | `HTTPBearer`, `OAuth2PasswordBearer` | `python-jose` or `PyJWT` | Low-Medium | +| **Opaque Token** | `APIKeyCookie`, `APIKeyHeader` | `redis` or in-memory cache | Medium | +| **Session-Based Access Control** | `APIKeyCookie`, `SecurityScopes` | `redis`, database ORM | High | +| **Obscure Token (API Keys)** | `APIKeyHeader`, `APIKeyQuery` | database ORM, `secrets` | Medium-High | + 
+## Key Library Recommendations + +### Password Hashing +```bash +pip install passlib[bcrypt] +# or +pip install bcrypt +``` + +### JWT Handling +```bash +pip install python-jose[cryptography] +# or +pip install PyJWT cryptography +``` + +### Session Storage +```bash +pip install redis +# or for async +pip install redis[asyncio] +``` + +### Database +```bash +pip install sqlalchemy +# or for async +pip install sqlalchemy[asyncio] +# or +pip install tortoise-orm +``` + +### Additional Security +```bash +pip install cryptography # For encryption, key management +pip install python-multipart # For form data +``` + +## Best Practices Across All Patterns + +1. **Always use HTTPS** - All tokens/credentials must be transmitted securely +2. **HTTPOnly cookies** - Prevent XSS attacks on session tokens +3. **SameSite cookies** - Prevent CSRF attacks +4. **Rate limiting** - Use `slowapi` or similar +5. **Logging** - Log all authentication/authorization failures +6. **Secret management** - Use environment variables or secret managers +7. **Token rotation** - Implement refresh tokens for long sessions +8. **Input validation** - Use Pydantic models +9. **Error messages** - Generic messages to prevent enumeration +10. **Security headers** - Use middleware for HSTS, CSP, etc. + diff --git a/docs/pattern_to_dependencies_signatures_mapping.md b/docs/pattern_to_dependencies_signatures_mapping.md new file mode 100644 index 0000000..a148440 --- /dev/null +++ b/docs/pattern_to_dependencies_signatures_mapping.md @@ -0,0 +1,305 @@ +# Security Pattern to Dependencies and Signatures Mapping Table + +This table maps Van den Berghe's security patterns to their implementation dependencies and specific function signatures/API calls for each pattern role. + +--- + +## 1. Password-Based Authentication + +| Role | Dependencies | Signature/API Usage | +|------|--------------|---------------------| +| **Enforcer** | `fastapi` | `OAuth2PasswordBearer(tokenUrl="token")`
`OAuth2PasswordRequestForm = Depends()`
`HTTPBasic()` | +| **Verification Manager** | `passlib` | `pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")`
`pwd_context.verify(plain_password, hashed_password)` | +| **Comparator** | `passlib` | `pwd_context.verify(plain_password, hashed_password)` | +| **Hasher** | `passlib`, `bcrypt` | `pwd_context.hash(password)`
Alternative: `bcrypt.hashpw(password.encode(), bcrypt.gensalt())` | +| **Password Store** | `sqlalchemy`, `tortoise-orm` | `session.query(User).filter(User.username == username).first()`
`await User.get(username=username)` | +| **Pepper Store** | `cryptography` | `Fernet(pepper_key)`
`cipher.encrypt(hashed_password.encode())` | +| **Encrypter** | `cryptography` | `from cryptography.fernet import Fernet`
`cipher = Fernet(key)`
`cipher.encrypt(data)` | +| **System** | `fastapi` | `@app.post("/login")`
`async def login(...)` | +| **Registrar** | `fastapi`, `passlib` | `@app.post("/register")`
`hashed = pwd_context.hash(password)`
`db.add(new_user)` | +| **Password Policy** | `pydantic`, custom | `@validator("password")`
`def validate_password(cls, v):`
` if len(v) < 8: raise ValueError(...)` | +| **SRNG** | `secrets` | `secrets.token_urlsafe(32)`
`secrets.token_bytes(32)` | + +**Complete Dependencies:** `fastapi`, `passlib[bcrypt]`, `cryptography`, `sqlalchemy` or `tortoise-orm`, `pydantic` + +--- + +## 2. Verifiable Token-Based Authentication (JWT) + +| Role | Dependencies | Signature/API Usage | +|------|--------------|---------------------| +| **Enforcer** | `fastapi` | `HTTPBearer()`
`OAuth2PasswordBearer(tokenUrl="token")`
`credentials: HTTPAuthorizationCredentials = Depends(security)` | +| **Verifier** | `python-jose`, `PyJWT` | `jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])`
Alternative (PyJWT, imported as `jwt`): `jwt.decode(token, public_key, algorithms=["RS256"])` | +| **Cryptographer** (MAC) | `python-jose` | `jwt.encode(data, SECRET_KEY, algorithm="HS256")`
`jwt.decode(token, SECRET_KEY, algorithms=["HS256"])` | +| **Cryptographer** (Digital Signature) | `python-jose`, `cryptography` | `jwt.encode(data, private_key, algorithm="RS256")`
`jwt.decode(token, public_key, algorithms=["RS256"])` | +| **Key Manager** (HMAC) | Built-in | `SECRET_KEY = "your-secret-key"`
Store in environment variables | +| **Key Manager** (RSA) | `cryptography` | `from cryptography.hazmat.primitives.asymmetric import rsa`
`private_key = rsa.generate_private_key(public_exponent=65537, key_size=2048)`
`public_key = private_key.public_key()` | +| **Token Generator** | `python-jose` | `access_token = jwt.encode({"sub": username, "exp": expire}, SECRET_KEY, algorithm="HS256")` | +| **Registrar** | `fastapi`, `python-jose` | `@app.post("/token")`
`token = create_access_token(data={"sub": user.username})`
`return {"access_token": token, "token_type": "bearer"}` | +| **Token Blacklist** (for revocation) | `redis`, `cachetools` | `redis_client.sadd("revoked_tokens", token)`
`redis_client.sismember("revoked_tokens", token)` | + +**Complete Dependencies:** `fastapi`, `python-jose[cryptography]` or `PyJWT`, `cryptography`, optional: `redis` for revocation + +--- + +## 3. Opaque Token-Based Authentication (Session) + +| Role | Dependencies | Signature/API Usage | +|------|--------------|---------------------| +| **Enforcer** | `fastapi` | `APIKeyCookie(name="session_id")`
`APIKeyHeader(name="X-Session-Token")`
`session_id: str = Depends(cookie_scheme)` | +| **Verifier** | Custom with storage backend | `async def verify_session(session_id: str):`
` session = await session_manager.get_session(session_id)`
` if not session: raise HTTPException(401)` | +| **Principal Provider** (Redis) | `redis[asyncio]` | `await redis_client.get(f"session:{token}")`
`await redis_client.setex(f"session:{token}", ttl, data)` | +| **Principal Provider** (In-memory) | `cachetools` | `sessions = TTLCache(maxsize=10000, ttl=1800)`
`sessions[token] = {"principal": username, ...}` | +| **Token Generator** | `secrets` | `session_id = secrets.token_urlsafe(32)`
`session_id = secrets.token_hex(32)` | +| **Registrar** | `fastapi`, `secrets`, storage | `@app.post("/login")`
`session_id = secrets.token_urlsafe(32)`
`await redis.setex(f"session:{session_id}", 1800, user_data)`
`response.set_cookie("session_id", session_id, httponly=True)` | +| **Session Manager** | `redis` or custom | `class SessionManager:`
` async def create_session(self, principal: str) -> str`
` async def get_principal(self, token: str) -> Optional[str]`
` async def invalidate_session(self, token: str)` | + +**Complete Dependencies:** `fastapi`, `redis[asyncio]` or `cachetools`, `secrets` (built-in) + +--- + +## 4. Session-Based Access Control + +| Role | Dependencies | Signature/API Usage | +|------|--------------|---------------------| +| **Authentication Enforcer** | `fastapi` | `APIKeyCookie(name="session_id")`
`session_id: str = Depends(cookie_scheme)` | +| **Verifier** | `redis`, custom | `session = await session_manager.get_session(session_id)`
`if not session: raise HTTPException(401)` | +| **Session Manager** | `redis[asyncio]` | `await redis.setex(f"session:{token}", ttl, session_data.json())`
`data = await redis.get(f"session:{token}")` | +| **Session ID Generator** | `secrets` | `secrets.token_urlsafe(32)` | +| **Authorization Enforcer** | `fastapi`, custom | `class AuthorizationChecker:`
` def __init__(self, required_permissions: List[str])`
`    async def __call__(self, session_id: str = Depends(...)): ...` | +| **Decider** | Custom logic | `if set(required_permissions).issubset(session.permissions):`
` return session`
`raise HTTPException(403)` | +| **Policy Provider** | `sqlalchemy`, `redis` | `db.query(Role).filter(Role.name == role_name).first()`
`role_permissions = policy_provider.get_role_permissions(role)` | +| **Registrar** | `fastapi`, `secrets`, `redis` | `@app.post("/login")`
`session_id = await session_manager.create_session(username)`
`response.set_cookie("session_id", session_id, httponly=True, secure=True)` | + +**Complete Dependencies:** `fastapi`, `redis[asyncio]`, `sqlalchemy` or database ORM, `secrets` (built-in), `pydantic` + +--- + +## 5. Obscure Token-Based Access Control (API Keys) + +| Role | Dependencies | Signature/API Usage | +|------|--------------|---------------------| +| **Enforcer** | `fastapi` | `APIKeyHeader(name="X-API-Key", auto_error=False)`
`APIKeyQuery(name="api_key")`
`api_key: str = Security(api_key_header)` | +| **Validator** (Auth + Authz) | Custom with DB | `class APIKeyValidator:`
` async def __call__(self, api_key: str = Security(...)):`
` token_info = manager.validate_token(api_key)`
` if not token_info: raise HTTPException(401)`
` if missing_perms: raise HTTPException(403)` | +| **Hasher** | `hashlib` | `import hashlib`
`hashlib.sha256(token.encode()).hexdigest()`
`hashlib.blake2b(token.encode()).hexdigest()` | +| **Token Manager** (Database) | `sqlalchemy` | `db.query(APIKey).filter(APIKey.token_hash == hash).first()`
`api_key = APIKey(token_hash=hash, principal=user, permissions=perms)`
`db.add(api_key)` | +| **Token Generator** | `secrets` | `secrets.token_urlsafe(32)`
`secrets.token_hex(32)` | +| **Registrar** | `fastapi`, `secrets`, `sqlalchemy` | `@app.post("/api-keys")`
`token = secrets.token_urlsafe(32)`
`token_hash = hashlib.sha256(token.encode()).hexdigest()`
`db.add(APIKey(token_hash=token_hash, principal=user, permissions=perms))`
`return {"api_key": token}` | +| **Permission Checker** | Custom | `if set(required_permissions).issubset(token_info.permissions):`
` return token_info`
`raise HTTPException(403, "Missing permissions")` | + +**Complete Dependencies:** `fastapi`, `sqlalchemy` or database ORM, `hashlib` (built-in), `secrets` (built-in) + +--- + +## Cross-Pattern Common Components + +### Database ORMs (Password Store, Token Manager, Policy Provider) + +| Library | Connection | Query | Insert | Update | Delete | +|---------|-----------|-------|--------|--------|--------| +| **SQLAlchemy** | `engine = create_engine("postgresql://...")`
`SessionLocal = sessionmaker(bind=engine)` | `db.query(User).filter(User.username == name).first()` | `db.add(user)`
`db.commit()` | `user.password = new_hash`
`db.commit()` | `db.delete(user)`
`db.commit()` | +| **Tortoise ORM** | `await Tortoise.init(db_url="...", modules={"models": [...]})`
`await Tortoise.generate_schemas()` | `await User.get(username=name)`
`await User.filter(username=name).first()` | `user = User(username=name, ...)`
`await user.save()` | `user.password = new_hash`
`await user.save()` | `await user.delete()` | + +### Redis (Session Storage, Token Blacklist) + +| Operation | Synchronous (redis-py) | Asynchronous (redis[asyncio]) | +|-----------|----------------------|------------------------------| +| **Connect** | `redis.Redis(host="localhost", port=6379, db=0)` | `await redis.from_url("redis://localhost")` | +| **Set with TTL** | `client.setex("key", 3600, "value")` | `await client.setex("key", 3600, "value")` | +| **Get** | `value = client.get("key")` | `value = await client.get("key")` | +| **Delete** | `client.delete("key")` | `await client.delete("key")` | +| **Check exists** | `client.exists("key")` | `await client.exists("key")` | +| **Set add** | `client.sadd("set_key", "member")` | `await client.sadd("set_key", "member")` | +| **Set check** | `client.sismember("set_key", "member")` | `await client.sismember("set_key", "member")` | + +### Cryptography (Hasher, Encrypter, Key Manager) + +| Purpose | Library | Signature/API Usage | +|---------|---------|---------------------| +| **Password Hashing** | `passlib` | `from passlib.context import CryptContext`
`pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")`
`hash = pwd_context.hash(password)`
`is_valid = pwd_context.verify(password, hash)` | +| **Password Hashing** | `bcrypt` | `import bcrypt`
`salt = bcrypt.gensalt()`
`hash = bcrypt.hashpw(password.encode(), salt)`
`is_valid = bcrypt.checkpw(password.encode(), hash)` | +| **General Hashing** | `hashlib` | `import hashlib`
`hash = hashlib.sha256(data.encode()).hexdigest()`
`hash = hashlib.blake2b(data.encode()).hexdigest()` | +| **Symmetric Encryption** | `cryptography` | `from cryptography.fernet import Fernet`
`key = Fernet.generate_key()`
`cipher = Fernet(key)`
`encrypted = cipher.encrypt(data.encode())`
`decrypted = cipher.decrypt(encrypted).decode()` | +| **RSA Key Generation** | `cryptography` | `from cryptography.hazmat.primitives.asymmetric import rsa`
`private_key = rsa.generate_private_key(public_exponent=65537, key_size=2048)` | +| **JWT with HMAC** | `python-jose` | `from jose import jwt`
`token = jwt.encode({"sub": user}, SECRET_KEY, algorithm="HS256")`
`payload = jwt.decode(token, SECRET_KEY, algorithms=["HS256"])` | +| **JWT with RSA** | `python-jose` | `token = jwt.encode({"sub": user}, private_key, algorithm="RS256")`
`payload = jwt.decode(token, public_key, algorithms=["RS256"])` | +| **Random Generation** | `secrets` | `token = secrets.token_urlsafe(32)`
`token = secrets.token_hex(32)`
`random_bytes = secrets.token_bytes(32)` | + +--- + +## FastAPI Security Utilities (Enforcer Role) + +| Security Scheme | Import | Initialization | Usage in Dependency | +|-----------------|--------|----------------|---------------------| +| **OAuth2PasswordBearer** | `from fastapi.security import OAuth2PasswordBearer` | `oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token")` | `async def get_user(token: str = Depends(oauth2_scheme))` | +| **OAuth2PasswordRequestForm** | `from fastapi.security import OAuth2PasswordRequestForm` | N/A | `async def login(form: OAuth2PasswordRequestForm = Depends())` | +| **HTTPBearer** | `from fastapi.security import HTTPBearer` | `security = HTTPBearer()` | `async def verify(creds: HTTPAuthorizationCredentials = Depends(security))` | +| **HTTPBasic** | `from fastapi.security import HTTPBasic` | `security = HTTPBasic()` | `async def verify(creds: HTTPBasicCredentials = Depends(security))` | +| **APIKeyHeader** | `from fastapi.security import APIKeyHeader` | `api_key_header = APIKeyHeader(name="X-API-Key")` | `async def verify(api_key: str = Depends(api_key_header))` | +| **APIKeyCookie** | `from fastapi.security import APIKeyCookie` | `cookie_scheme = APIKeyCookie(name="session_id")` | `async def verify(session: str = Depends(cookie_scheme))` | +| **APIKeyQuery** | `from fastapi.security import APIKeyQuery` | `api_key_query = APIKeyQuery(name="api_key")` | `async def verify(key: str = Depends(api_key_query))` | +| **SecurityScopes** | `from fastapi.security import SecurityScopes` | N/A | `async def check(scopes: SecurityScopes, token: str = Depends(...))` | + +--- + +## Pattern-Specific Zoekt Query Signatures + +These are example search patterns to find implementations in codebases: + +### Password-Based Authentication + +``` +# Find Enforcer implementations +OAuth2PasswordBearer\s*\( +OAuth2PasswordRequestForm\s*=\s*Depends\(\) + +# Find Hasher implementations +CryptContext\s*\(\s*schemes\s*=\s*\[ +pwd_context\.hash\( 
+bcrypt\.hashpw\( + +# Find Comparator implementations +pwd_context\.verify\( +bcrypt\.checkpw\( + +# Find Registrar endpoints +@app\.post\(["\']/(register|signup) +``` + +### Verifiable Token-Based Authentication (JWT) + +``` +# Find Enforcer implementations +HTTPBearer\s*\(\) +OAuth2PasswordBearer\s*\( + +# Find Token Generator implementations +jwt\.encode\( +create_access_token\( + +# Find Verifier implementations +jwt\.decode\( +verify_token\( + +# Find Key Manager (RSA) +rsa\.generate_private_key\( +private_key\.public_key\(\) +``` + +### Opaque Token-Based Authentication + +``` +# Find Enforcer implementations +APIKeyCookie\s*\( +APIKeyHeader\s*\( + +# Find Token Generator implementations +secrets\.token_urlsafe\( +secrets\.token_hex\( + +# Find Session Manager implementations +redis.*setex.*session +TTLCache\s*\( +session_manager\.create_session\( +session_manager\.get_principal\( +``` + +### Session-Based Access Control + +``` +# Find combined auth+authz implementations +class\s+.*Checker.*:\s*def\s+__init__.*required_permissions +session\.permissions +raise\s+HTTPException\s*\(\s*status_code\s*=\s*(403|status\.HTTP_403_FORBIDDEN) + +# Find Policy Provider implementations +get_role_permissions\( +get_user_permissions\( +policy_provider\.get_privileges\( +``` + +### Obscure Token-Based Access Control (API Keys) + +``` +# Find Enforcer implementations +APIKeyHeader\s*\(.*name\s*=\s*["\']X-API-Key +Security\s*\(\s*api_key_header\s*\) + +# Find Token Manager implementations +APIKey\s*\(.*token_hash +hashlib\.sha256\(.*token +api_key\.permissions + +# Find Validator implementations +validate_token\( +token_hash\s*=.*hashlib +required_permissions.*issubset +``` + +--- + +## Complete Dependency Installation Commands + +```bash +# Password-Based Authentication +pip install fastapi[all] passlib[bcrypt] sqlalchemy cryptography pydantic python-multipart + +# Verifiable Token-Based Authentication (JWT) +pip install fastapi[all] python-jose[cryptography] cryptography + 
+# Opaque Token-Based Authentication +pip install fastapi[all] redis[asyncio] +# OR for in-memory +pip install fastapi[all] cachetools + +# Session-Based Access Control +pip install fastapi[all] redis[asyncio] sqlalchemy pydantic + +# Obscure Token-Based Access Control +pip install fastapi[all] sqlalchemy + +# Complete installation (all patterns) +pip install fastapi[all] \ + passlib[bcrypt] \ + python-jose[cryptography] \ + cryptography \ + redis[asyncio] \ + sqlalchemy \ + pydantic \ + python-multipart +``` + +--- + +## Usage Pattern Template + +### General Pattern for Any Security Implementation + +```python +from fastapi import FastAPI, Depends, HTTPException, status +from fastapi.security import [ENFORCER_CLASS] + +app = FastAPI() + +# 1. Initialize Enforcer +enforcer = [ENFORCER_CLASS]([PARAMETERS]) + +# 2. Create Verification Logic +async def verify_credentials([CREDENTIAL_PARAM] = Depends(enforcer)): + # Verification logic using appropriate libraries + [VERIFICATION_CODE] + return [PRINCIPAL_OR_TOKEN_INFO] + +# 3. Protected Endpoint +@app.get("/protected") +async def protected_route([USER_PARAM] = Depends(verify_credentials)): + return {"message": "Access granted", "user": [USER_PARAM]} + +# 4. 
Registration/Token Generation +@app.post("/[AUTH_ENDPOINT]") +async def authenticate([AUTH_PARAMS]): + # Generate token/session + [TOKEN_GENERATION_CODE] + return {[TOKEN_RESPONSE]} +``` + From bab6a78fd6aa8364c88f046fcfd2d8049e3cc98d Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Sun, 9 Nov 2025 11:07:54 +0000 Subject: [PATCH 22/26] fix: update query syntax for password-based authentication roles --- .../password_based_authentication.yaml | 38 +++++++------------ 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml index 707c5d0..cad0893 100644 --- a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml +++ b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml @@ -15,11 +15,11 @@ roles: enforcer: description: "Ensures the requested action is only performed if the Subject is successfully authenticated" queries: - - query: "Depends OAuth2PasswordRequestForm OAuth2PasswordRequestFormRestrict" + - query: "Depends OR OAuth2PasswordRequestForm OR OAuth2PasswordRequestFormRestrict" description: "Files containing both OAuth2 password authentication classes" priority: high - - query: "Depends HTTPBasic HTTPBasicCredentials" + - query: "Depends OR HTTPBasic OR HTTPBasicCredentials" description: "Files implementing HTTP Basic authentication" priority: medium @@ -29,27 +29,19 @@ roles: - query: "CryptContext pwd_context verify" description: "Files with password verification context" priority: high - - - query: "def verify_password lang:python" - description: "Python functions that verify passwords" - priority: high - - - query: "authenticate_user username password" - description: "User 
authentication functions" - priority: high comparator: description: "Compares the hash value of the received password against stored hash" queries: - - query: "pwd_context.verify bcrypt.checkpw" + - query: "CryptContext pwd_context verify" description: "Files containing password comparison functions" priority: high - - query: "pwd_context.verify(plain_password, hashed_password)" + - query: "pwd_context verify plain_password hashed_password " description: "Direct password verification calls" priority: high - - query: "bcrypt.checkpw password.encode" + - query: "bcrypt checkpw password encode" description: "Bcrypt password checking" priority: medium @@ -64,7 +56,7 @@ roles: description: "Files configuring bcrypt hashing" priority: high - - query: "pwd_context.hash(password)" + - query: "pwd_context.hash password " description: "Direct password hashing calls" priority: high @@ -87,7 +79,7 @@ roles: description: "Database column for hashed passwords" priority: high - - query: "db.query(User).filter username" + - query: "db.query User.filter username" description: "User lookup queries" priority: medium @@ -97,10 +89,6 @@ roles: - query: "Fernet pepper_key encrypt" description: "Files implementing pepper encryption" priority: medium - - - query: "pepper cipher.encrypt" - description: "Pepper encryption operations" - priority: medium encrypter: description: "Encrypts a given data element using a given cryptographic key" @@ -116,7 +104,7 @@ roles: registrar: description: "Handles the Subject's registration" queries: - - query: "@app.post /register pwd_context.hash" + - query: "app.post register pwd_context.hash" description: "Registration endpoints with password hashing" priority: high @@ -124,7 +112,7 @@ roles: description: "User registration functions" priority: high - - query: "@app.post /signup create_user" + - query: "app.post signup create_user" description: "Signup endpoints" priority: medium @@ -135,9 +123,9 @@ roles: password_policy: description: "Contains the 
rules any password should satisfy" queries: - - query: "@validator password len" - description: "Pydantic password validators" + - query: " validator password len" priority: high + description: "Pydantic password validators" - query: "class PasswordPolicy validate" description: "Password policy validation classes" @@ -165,12 +153,12 @@ roles: complete_implementation: description: "Queries to find files with complete password authentication implementation" queries: - - query: "OAuth2PasswordBearer CryptContext pwd_context.hash pwd_context.verify lang:python" + - query: "OAuth2PasswordBearer CryptContext pwd_context.hash pwd_context.verify " description: "Complete password authentication pattern" priority: critical min_matches: 4 - - query: "@app.post /login authenticate_user pwd_context lang:python -file:test" + - query: "@app.post /login authenticate_user pwd_context -file:test" description: "Login endpoint with authentication" priority: high min_matches: 3 From 13288cb3a0fb18305212c7fbe43f862756ecd3f1 Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Mon, 10 Nov 2025 10:12:54 +0000 Subject: [PATCH 23/26] update recrawl --- .../password_based_authentication.yaml | 2 +- .../src/dependent_miner/base.py | 60 ++++++++++++++++++- security_pattern_miner/src/runner.py | 21 ++++++- 3 files changed, 77 insertions(+), 6 deletions(-) diff --git a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml index cad0893..e166954 100644 --- a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml +++ b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml @@ -9,7 +9,7 @@ dependencies: - cryptography repo_metadata_file: - - 
python_Pypi_mutual_dependents_fastapi_passlib.jsonl + - python_Pypi_mutual_dependents_fastapi_passlib_cryptography.jsonl roles: enforcer: diff --git a/security_pattern_miner/src/dependent_miner/base.py b/security_pattern_miner/src/dependent_miner/base.py index 3aac681..16a40d8 100644 --- a/security_pattern_miner/src/dependent_miner/base.py +++ b/security_pattern_miner/src/dependent_miner/base.py @@ -25,7 +25,45 @@ def __init__(self, package_manager: str, language: str, config: LibrariesIOConfi self.language = language self.config = config + def has_cleaned_dependents_file(self, package_name: str) -> bool: + """ + Check if a cleaned dependents file already exists for the package. + + Args: + package_name: Name of the package to check + + Returns: + True if cleaned file exists, False otherwise + """ + import os + if not os.path.exists(self.config.dependent_repo_info_save_dir): + return False + + cleaned_file_path = os.path.join( + self.config.dependent_repo_info_save_dir, + f"{self.language}_{self.package_manager}_{package_name}_dependents_{self.config.start_page}_cleaned.jsonl" + ) + + exists = os.path.exists(cleaned_file_path) + if exists: + # Also check if file is not empty + try: + with open(cleaned_file_path, 'r') as f: + first_line = f.readline() + return bool(first_line.strip()) + except Exception: + return False + return False + def get_dependents(self, package_name: str) -> List[DependentRepositoryInfo]: + # Check if cleaned file already exists + if self.has_cleaned_dependents_file(package_name): + logger.info(f"Cleaned dependents file already exists for package {package_name}. Skipping API fetch.") + logger.info(f"To re-fetch, delete the file or use --clean_only to reprocess existing raw files.") + return self.load_saved_dependents(package_name) + + logger.info(f"No cleaned dependents file found for {package_name}. 
Fetching from Libraries.io API...") + num_pages = self.config.start_page dependents = [] while num_pages <= self.config.start_page + self.config.max_num_pages: @@ -144,17 +182,35 @@ def clean_saved_dependents(self, package_name: str): # Remove duplicated JSON line (dependent ) in previously saved dependents file if exists import os if not os.path.exists(self.config.dependent_repo_info_save_dir): + logger.warning(f"Dependent repo info directory does not exist: {self.config.dependent_repo_info_save_dir}") return - file_path = os.path.join(LibrariesIOConfig.dependent_repo_info_save_dir, f"{self.language}_{self.package_manager}_{package_name}_dependents_{LibrariesIOConfig.start_page}.jsonl") - cleaned_file_path = os.path.join(LibrariesIOConfig.dependent_repo_info_save_dir, f"{self.language}_{self.package_manager}_{package_name}_dependents_{LibrariesIOConfig.start_page}_cleaned.jsonl") + + file_path = os.path.join( + LibrariesIOConfig.dependent_repo_info_save_dir, + f"{self.language}_{self.package_manager}_{package_name}_dependents_{LibrariesIOConfig.start_page}.jsonl" + ) + cleaned_file_path = os.path.join( + LibrariesIOConfig.dependent_repo_info_save_dir, + f"{self.language}_{self.package_manager}_{package_name}_dependents_{LibrariesIOConfig.start_page}_cleaned.jsonl" + ) + if not os.path.exists(file_path): + logger.warning(f"Raw dependents file not found: {file_path}") + # Check if cleaned file already exists + if os.path.exists(cleaned_file_path): + logger.info(f"Cleaned file already exists: {cleaned_file_path}") return + + logger.info(f"Cleaning dependents file for package {package_name}...") unique_dependents = {} with jsonlines.open(file_path, "r") as f: for dep in f: unique_dependents[dep['full_name']] = dep + with jsonlines.open(cleaned_file_path, "w") as f: f.write_all(unique_dependents.values()) + + logger.info(f"Cleaned {len(unique_dependents)} unique dependents from {file_path} to {cleaned_file_path}") def load_saved_dependents(self, package_name: str) -> 
List[DependentRepositoryInfo]: import os diff --git a/security_pattern_miner/src/runner.py b/security_pattern_miner/src/runner.py index 64060a7..911afd0 100644 --- a/security_pattern_miner/src/runner.py +++ b/security_pattern_miner/src/runner.py @@ -49,18 +49,33 @@ def __init__(self, args): def run(self, package_names: list[str]): if self.args.get_dependents: # Step 0: Get each package's dependents and save to files + logger.info(f"Starting dependent mining for packages: {', '.join(package_names)}") + for pkg in package_names: - self.dependent_miner.get_dependents(pkg) - self.dependent_miner.clean_saved_dependents(pkg) + logger.info(f"\n{'='*60}") + logger.info(f"Processing package: {pkg}") + logger.info(f"{'='*60}") + + # Check if cleaned file exists + if self.dependent_miner.has_cleaned_dependents_file(pkg): + logger.info(f"✓ Package {pkg} already has cleaned dependents file") + logger.info(f" Skipping API fetch. File will be used for mutual dependents calculation.") + else: + logger.info(f"✗ Package {pkg} needs to fetch dependents from Libraries.io API") + self.dependent_miner.get_dependents(pkg) + self.dependent_miner.clean_saved_dependents(pkg) + logger.info(f"✓ Completed fetching and cleaning dependents for {pkg}") if len(package_names) < 2: logger.warning("At least two package names are required to find mutual dependents. Stopping") return if self.args.clean_only: - logger.info("Cleaned saved dependents files. Stopping as --clean_only is set.") + logger.info("Running in clean-only mode...") for pkg in package_names: + logger.info(f"Cleaning dependents for package: {pkg}") self.dependent_miner.clean_saved_dependents(pkg) + logger.info("Cleaned saved dependents files. 
Stopping as --clean_only is set.") return if self.args.crawl_only: From c3b3d79d94c778d1ced4e9374bdf2b7c8b9cb6dc Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Mon, 10 Nov 2025 13:02:37 +0000 Subject: [PATCH 24/26] update pwd based rules --- .../obscure_token_access_control.yaml | 56 --------- .../patterns/opaque_token_authentication.yaml | 80 ------------- .../password_based_authentication.yaml | 111 ++++++------------ .../session_based_access_control.yaml | 80 ------------- .../verifiable_token_authentication.yaml | 66 +---------- 5 files changed, 40 insertions(+), 353 deletions(-) diff --git a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/obscure_token_access_control.yaml b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/obscure_token_access_control.yaml index 82e0b58..79caee8 100644 --- a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/obscure_token_access_control.yaml +++ b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/obscure_token_access_control.yaml @@ -297,63 +297,7 @@ sensitive_actions: description: "Password change not accessible via API key" priority: high -anti_patterns: - description: "Security anti-patterns to detect" - - plaintext_storage: - query: "Column String api_key -hash -token_hash" - severity: critical - - weak_tokens: - query: "secrets.token_urlsafe(8) secrets.token_urlsafe(16)" - severity: critical - - no_hashing: - query: "api_key = secrets.token db.add -hash" - severity: critical - - predictable_tokens: - query: "uuid.uuid4() random.randint -secrets" - severity: critical - - no_expiration: - query: "class APIKey -expires_at -expiration" - severity: medium - - overprivileged_keys: - query: "permissions = [\"*\"] permissions = [\"all\"]" - severity: high - - api_key_for_sensitive: - query: "@app.delete /account api_key APIKeyHeader" - severity: high -filters: - language: python - 
file_extension: "\\.py$" - exclude_tests: true - exclude_forks: true - exclude_archived: true - -search_strategy: - steps: - - name: "Find Enforcer" - queries: ["APIKeyHeader", "APIKeyQuery", "X-API-Key"] - - - name: "Find Token Generator" - queries: ["secrets.token_urlsafe", "secrets.token_hex"] - - - name: "Find Hasher" - queries: ["hashlib.sha256", "hash_token"] - - - name: "Find Token Manager" - queries: ["class APIKey", "token_hash principal permissions"] - - - name: "Find Validator" - queries: ["validate_token", "APIKeyValidator"] - - - name: "Verify Security" - queries: ["hashlib secrets db.query", "httponly=False secure=False"] best_practices: token_generation: diff --git a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/opaque_token_authentication.yaml b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/opaque_token_authentication.yaml index ed688b6..b6a37ff 100644 --- a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/opaque_token_authentication.yaml +++ b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/opaque_token_authentication.yaml @@ -238,83 +238,3 @@ storage_backends: - "from cachetools import TTLCache" - "sessions: Dict[str, SessionData]" - "threading.Lock sessions" - -anti_patterns: - description: "Security anti-patterns to detect" - - predictable_tokens: - query: "session_id = str(uuid.uuid4()) -secrets" - severity: critical - - short_tokens: - query: "secrets.token_urlsafe(8) secrets.token_urlsafe(16)" - severity: high - - insecure_cookies: - query: "set_cookie -httponly -secure" - severity: critical - - no_timeout: - query: "set_cookie -max_age -expires" - severity: high - - weak_random: - query: "random.randint session_id" - severity: critical - - session_fixation: - query: "session_id = request.cookies.get -create_session" - severity: high - -filters: - language: python - file_extension: "\\.py$" - 
exclude_tests: true - exclude_forks: true - exclude_archived: true - -search_strategy: - steps: - - name: "Find Enforcer" - queries: ["APIKeyCookie", "APIKeyHeader"] - - - name: "Find Token Generator" - queries: ["secrets.token_urlsafe", "secrets.token_hex"] - - - name: "Find Session Storage" - queries: ["redis.setex session:", "TTLCache sessions"] - - - name: "Find Session Manager" - queries: ["class SessionManager", "create_session invalidate_session"] - - - name: "Verify Secure Implementation" - queries: ["httponly=True secure=True", "session_timeout"] - -best_practices: - token_generation: - queries: - - "secrets.token_urlsafe(32)" - - "secrets.token_urlsafe(64)" - - "token_bytes(32)" - description: "At least 128 bits (16 bytes) of entropy" - - cookie_security: - queries: - - "httponly=True" - - "secure=True" - - "samesite=\"lax\"" - - "samesite=\"strict\"" - description: "Secure cookie attributes" - - timeout_config: - queries: - - "session_timeout = 1800" - - "absolute_timeout =" - - "max_age =" - description: "Proper timeout configuration" - - session_regeneration: - queries: - - "invalidate_session create_session" - - "new_session_id = secrets" - description: "Session regeneration on privilege change" diff --git a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml index e166954..391a2d6 100644 --- a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml +++ b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml @@ -6,162 +6,123 @@ pattern: dependencies: - fastapi - passlib - - cryptography + - bcrypt repo_metadata_file: - - python_Pypi_mutual_dependents_fastapi_passlib_cryptography.jsonl + - 
python_Pypi_mutual_dependents_fastapi_passlib_bcrypt.jsonl roles: enforcer: description: "Ensures the requested action is only performed if the Subject is successfully authenticated" queries: - - query: "Depends OR OAuth2PasswordRequestForm OR OAuth2PasswordRequestFormRestrict" - description: "Files containing both OAuth2 password authentication classes" - priority: high - - - query: "Depends OR HTTPBasic OR HTTPBasicCredentials" + - query: "HTTPBasic Depends HTTPBasicCredentials" description: "Files implementing HTTP Basic authentication" - priority: medium verification_manager: description: "Responsible to collect inputs necessary to verify a Subject's password" queries: - - query: "CryptContext pwd_context verify" + - query: "CryptContext verify" description: "Files with password verification context" - priority: high + - query: "pwd_context verify plain_password hashed_password " + description: "Direct password verification calls" + - query: "bcrypt checkpw password encode" + description: "Bcrypt password checking" comparator: description: "Compares the hash value of the received password against stored hash" queries: - query: "CryptContext pwd_context verify" description: "Files containing password comparison functions" - priority: high - - query: "pwd_context verify plain_password hashed_password " description: "Direct password verification calls" - priority: high - - query: "bcrypt checkpw password encode" description: "Bcrypt password checking" - priority: medium hasher: description: "Calculates the hash value for a given input" queries: - - query: "pwd_context.hash bcrypt.hashpw" + - query: "CryptContext hash" description: "Files with password hashing implementations" - priority: high - - - query: "CryptContext schemes bcrypt" - description: "Files configuring bcrypt hashing" - priority: high - - - query: "pwd_context.hash password " + - query: "pwd_context hash password " description: "Direct password hashing calls" - priority: high - - - query: 
"bcrypt.gensalt bcrypt.hashpw" + - query: "bcrypt gensalt hashpw" description: "Bcrypt salt generation and hashing" - priority: medium password_store: description: "Keeps track of hash values corresponding to each registered identity" queries: - query: "hashed_password User.query" description: "Files with user password storage queries" - priority: high - query: "class User password sqlalchemy" description: "SQLAlchemy User models with password fields" - priority: high - query: "Column String hashed_password" description: "Database column for hashed passwords" - priority: high - query: "db.query User.filter username" description: "User lookup queries" - priority: medium pepper_store: description: "Keeps track of the pepper value(s) used by the system" queries: - query: "Fernet pepper_key encrypt" description: "Files implementing pepper encryption" - priority: medium + encrypter: description: "Encrypts a given data element using a given cryptographic key" queries: - - query: "from cryptography.fernet import Fernet" + - query: "CryptContext encrypt" description: "Files importing Fernet encryption" - priority: medium - - - query: "Fernet.generate_key cipher.encrypt" - description: "Fernet key generation and encryption" - priority: medium registrar: description: "Handles the Subject's registration" queries: - - query: "app.post register pwd_context.hash" - description: "Registration endpoints with password hashing" - priority: high - - - query: "def register_user db.add" + - query: "app post register" + description: "Registration endpoints" + - query: "register user db add" description: "User registration functions" - priority: high - - - query: "app.post signup create_user" + - query: "app post signup create_user" description: "Signup endpoints" - priority: medium - - - query: "new_user = User db.commit" + - query: "new user User db commit" description: "User creation and database commit" - priority: medium password_policy: description: "Contains the rules any password 
should satisfy" queries: - query: " validator password len" - priority: high description: "Pydantic password validators" - - - query: "class PasswordPolicy validate" + - query: "PasswordPolicy validate" description: "Password policy validation classes" - priority: medium - - query: "password len min_length max_length" description: "Password length validation" priority: medium - - - query: "raise ValueError password must" - description: "Password validation errors" - priority: low - + - query: "password lowercase uppercase" + description: "Password case validation" + srng: description: "Cryptographically secure random number generator" queries: - - query: "secrets.token_urlsafe secrets.token_bytes" + - query: "secrets token_urlsafe token_bytes" description: "Secure random number generation" priority: high - - query: "import secrets token" description: "Secrets module usage for tokens" - priority: medium + - query: "import secrets token" + description: "Secrets module usage for tokens" -complete_implementation: - description: "Queries to find files with complete password authentication implementation" - queries: - - query: "OAuth2PasswordBearer CryptContext pwd_context.hash pwd_context.verify " - description: "Complete password authentication pattern" - priority: critical - min_matches: 4 - - - query: "@app.post /login authenticate_user pwd_context -file:test" - description: "Login endpoint with authentication" - priority: high - min_matches: 3 + resetter: + description: "Handles password reset requests" + queries: + - query: "app post reset-password" + description: "Password reset request endpoints" + - query: "reset_token send_email" + description: "Password reset token generation and email sending" + - query: "app post forgot-password" + description: "Forgot password endpoints" + - query: "def reset_password token new_password" + description: "Password reset functions" endpoints: login: diff --git 
a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/session_based_access_control.yaml b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/session_based_access_control.yaml index ad7a020..9d9e92a 100644 --- a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/session_based_access_control.yaml +++ b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/session_based_access_control.yaml @@ -284,83 +284,3 @@ factories_and_helpers: - "def require_auth_and_authz" - "verify_credentials check_permissions" -anti_patterns: - description: "Security anti-patterns to detect" - - authorization_bypass: - query: "if True: pass # TODO: implement authorization" - severity: critical - - permission_disabled: - query: "ENABLE_AUTHORIZATION = False" - severity: critical - - role_hardcoded: - query: "if username == \"admin\"" - severity: high - - no_permission_check: - query: "@app.delete -required_permissions -Depends" - severity: high - - session_without_permissions: - query: "class SessionData principal -permissions -role" - severity: medium - -filters: - language: python - file_extension: "\\.py$" - exclude_tests: true - exclude_forks: true - exclude_archived: true - -search_strategy: - steps: - - name: "Find Authentication Enforcer" - queries: ["APIKeyCookie session_id"] - - - name: "Find Authorization Enforcer" - queries: ["class AuthorizationChecker", "require_permissions"] - - - name: "Find Session Manager" - queries: ["class SessionManager permissions role"] - - - name: "Find Policy Provider" - queries: ["class PolicyProvider", "get_role_permissions"] - - - name: "Find Permission Checking" - queries: ["required_permissions.issubset", "HTTPException 403"] - - - name: "Verify Complete Implementation" - queries: ["SessionData permissions HTTPException 403"] - -integration_patterns: - session_with_permissions: - queries: - - "session_data = 
SessionData(principal=username, role=role, permissions=perms)" - - "get_user_permissions principal" - - login_flow: - queries: - - "authenticate_user -> get_user_role -> get_user_permissions -> create_session" - - "@app.post /login authenticate_user policy_provider" - - protected_endpoint_flow: - queries: - - "verify_session -> check_permissions -> execute_action" - - "Depends(verify) Depends(authorize)" - -rbac_vs_abac: - rbac: - description: "Role-Based Access Control patterns" - queries: - - "role_permissions: Dict[Role, Set[Permission]]" - - "if session.role == Role.ADMIN" - - "get_role_permissions(role)" - - abac: - description: "Attribute-Based Access Control patterns" - queries: - - "policy_function(session: SessionData, context: Dict[str, Any])" - - "subject_attributes resource_attributes action_attributes" - - "evaluate_policy(subject, resource, action, environment)" diff --git a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/verifiable_token_authentication.yaml b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/verifiable_token_authentication.yaml index efa9a69..91e9dcb 100644 --- a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/verifiable_token_authentication.yaml +++ b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/verifiable_token_authentication.yaml @@ -22,11 +22,11 @@ roles: description: "HTTP authorization credentials dependency" priority: high - - query: "security = HTTPBearer()" + - query: "security = HTTPBearer" description: "HTTPBearer security scheme initialization" priority: high - - - query: "credentials: HTTPAuthorizationCredentials = Depends(security)" + + - query: "credentials HTTPAuthorizationCredentials = Depends security" description: "Bearer token dependency injection" priority: high @@ -41,7 +41,7 @@ roles: description: "Token verification functions" priority: high - - query: 
"jwt.decode(token, SECRET_KEY, algorithms" + - query: "jwt.decode token, SECRET_KEY, algorithms" description: "Direct JWT decode calls" priority: high @@ -251,62 +251,4 @@ token_features: - "RS384" - "RS512" -anti_patterns: - description: "Security anti-patterns to detect" - - weak_algorithm: - query: "algorithm = \"none\" jwt.encode" - severity: critical - - missing_expiration: - query: "jwt.encode -exp -expires" - severity: high - - hardcoded_secret: - query: "SECRET_KEY = \"my-secret-key\"" - severity: critical - - weak_secret: - query: "SECRET_KEY = \"secret\" \"test\"" - severity: critical - - no_algorithm_verification: - query: "jwt.decode -algorithms verify_signature=False" - severity: critical - -filters: - language: python - file_extension: "\\.py$" - exclude_tests: true - exclude_forks: true - exclude_archived: true -search_strategy: - steps: - - name: "Find Enforcer" - queries: ["HTTPBearer", "OAuth2PasswordBearer"] - - - name: "Find Token Generator" - queries: ["create_access_token", "jwt.encode"] - - - name: "Find Verifier" - queries: ["jwt.decode", "verify_token"] - - - name: "Find Key Management" - queries: ["SECRET_KEY", "rsa.generate_private_key"] - - - name: "Verify Complete Implementation" - queries: ["HTTPBearer jwt.encode jwt.decode"] - -library_alternatives: - jwt_libraries: - - name: python-jose - queries: - - "from jose import jwt" - - "from jose import JWTError" - - - name: PyJWT - queries: - - "import jwt" - - "import jwt as PyJWT" - - "from jwt import encode, decode" From 11483bfaf81b09d96d7b63deabb2baf1586ff3da Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Mon, 10 Nov 2025 13:59:13 +0000 Subject: [PATCH 25/26] update verifiable token search --- .../verifiable_token_authentication.yaml | 83 +++++++------------ 1 file changed, 30 insertions(+), 53 deletions(-) diff --git a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/verifiable_token_authentication.yaml 
b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/verifiable_token_authentication.yaml index 91e9dcb..351a80c 100644 --- a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/verifiable_token_authentication.yaml +++ b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/verifiable_token_authentication.yaml @@ -8,48 +8,42 @@ dependencies: - python-jose[cryptography] - PyJWT - cryptography - - redis # optional, for token revocation + +repo_metadata_file: + - python_Pypi_mutual_dependents_fastapi_pyjwt.jsonl + roles: enforcer: description: "Ensures the requested action is only performed if the Subject is successfully authenticated" queries: - - query: "HTTPBearer OAuth2PasswordBearer" + - query: "HTTPBearer Depends" description: "Bearer token enforcement" priority: high - - - query: "HTTPAuthorizationCredentials Depends" + - query: " Depends OAuth2PasswordBearer" description: "HTTP authorization credentials dependency" priority: high - - - query: "security = HTTPBearer" - description: "HTTPBearer security scheme initialization" - priority: high - - - query: "credentials HTTPAuthorizationCredentials = Depends security" - description: "Bearer token dependency injection" - priority: high verifier: description: "Manages the verification of whether a token is valid" queries: - - query: "jwt.decode SECRET_KEY algorithms" + - query: "jwt decode SECRET_KEY algorithms" description: "JWT token verification" priority: high - - query: "def verify_token jwt.decode" + - query: "def verify_token jwt decode" description: "Token verification functions" priority: high - - - query: "jwt.decode token, SECRET_KEY, algorithms" + + - query: "jwt decode token, SECRET_KEY, algorithms" description: "Direct JWT decode calls" priority: high - - - query: "payload = jwt.decode JWTError" + + - query: "payload jwt decode JWTError" description: "JWT decode with error handling" priority: high - - 
query: "credentials.credentials jwt.decode" + - query: "credentials jwt decode" description: "Extracting and verifying JWT from credentials" priority: medium @@ -64,11 +58,11 @@ roles: description: "Python-jose JWT import" priority: high - - query: "algorithm = \"HS256\" jwt.encode" + - query: "algorithm HS256 jwt.encode" description: "HMAC algorithm configuration" priority: high - - query: "jwt.encode jwt.decode HS256" + - query: "jwt encode decode HS256" description: "Complete HMAC JWT operations" priority: medium @@ -83,7 +77,7 @@ roles: description: "JWT verification with public key" priority: high - - query: "algorithm = \"RS256\" jwt" + - query: "algorithm RS256 jwt" description: "RSA signature algorithm configuration" priority: high @@ -98,7 +92,7 @@ roles: description: "Secret key from environment" priority: high - - query: "SECRET_KEY ALGORITHM = \"HS256\"" + - query: "SECRET_KEY ALGORITHM HS256 " description: "HMAC key and algorithm configuration" priority: high @@ -117,53 +111,53 @@ roles: description: "RSA key management imports" priority: high - - query: "private_key.public_key() serialization" + - query: "private_key public_key serialization" description: "Public key extraction and serialization" priority: high - - query: "serialization.PrivateFormat.PKCS8" + - query: "serialization PrivateFormat PKCS8" description: "Private key serialization" priority: medium - - query: "serialization.PublicFormat.SubjectPublicKeyInfo" + - query: "serialization PublicFormat SubjectPublicKeyInfo" description: "Public key serialization" priority: medium token_generator: description: "Manages the generation of new tokens" queries: - - query: "def create_access_token jwt.encode" + - query: "def create_access_token jwt encode" description: "Access token creation functions" priority: high - - query: "jwt.encode exp sub" + - query: "jwt encode exp sub" description: "JWT encoding with expiration and subject" priority: high - - query: "create_access_token data: dict 
expires_delta" + - query: "create_access_token data dict expires_delta" description: "Token creation with expiration parameter" priority: high - - query: "timedelta(minutes jwt.encode" + - query: "timedelta minutes jwt encode" description: "Token expiration time calculation" priority: medium - - query: "datetime.utcnow() + expires_delta" + - query: "datetime.utcnow expires_delta" description: "Expiration timestamp calculation" priority: low registrar: description: "Provides the Subject a token after successful authentication" queries: - - query: "@app.post /token create_access_token" + - query: "app post token create_access_token" description: "Token issuance endpoints" priority: high - - query: "return access_token token_type bearer" + - query: "return access token token_type bearer" description: "Token response formatting" priority: high - - query: "@app.post /login jwt.encode" + - query: "app post login jwt.encode" description: "Login endpoint with JWT generation" priority: high @@ -174,11 +168,11 @@ roles: token_blacklist: description: "Tracks revoked but not yet expired tokens (optional feature)" queries: - - query: "redis.sadd revoked_tokens jwt" + - query: "redis sadd revoked_tokens jwt" description: "Token revocation with Redis" priority: medium - - query: "blacklist.revoke token" + - query: "blacklist revoke token" description: "Token blacklist management" priority: medium @@ -190,23 +184,6 @@ roles: description: "Revocation checking" priority: low -complete_implementation: - description: "Queries to find files with complete JWT authentication implementation" - queries: - - query: "HTTPBearer jwt.encode jwt.decode SECRET_KEY lang:python" - description: "Complete JWT authentication pattern" - priority: critical - min_matches: 4 - - - query: "create_access_token jwt.encode exp sub lang:python -file:test" - description: "Token generation with proper claims" - priority: high - min_matches: 3 - - - query: "@app.post /token authenticate_user jwt.encode 
lang:python" - description: "Token endpoint with authentication" - priority: high - min_matches: 3 endpoints: token_issuance: From ac7abfa5754adda862d93c5279ec0b61b500e2e1 Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Tue, 11 Nov 2025 10:58:37 +0000 Subject: [PATCH 26/26] run(vta): docker compose --- docker-compose.yml | 4 +- .../password_based_authentication.yaml | 8 +- .../verifiable_token_authentication.yaml | 132 +++++------------- 3 files changed, 39 insertions(+), 105 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index bd12038..abd26fb 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,7 +8,7 @@ services: - ./build/volumes/data:/data env_file: - ./.env - command: ["python", "./runner.py", "--get_dependents", "--package_names", "org.springframework.security:spring-security-core", "--language", "java", "--package_manager", "Maven", "--root_data_dir=/data"] + command: ["python", "./runner.py", "--get_dependents", "--crawl_only", "--package_names", "fastapi", "bcrypt", "--language", "python", "--package_manager", "Pypi", "--root_data_dir=/data"] deploy: resources: limits: @@ -29,7 +29,7 @@ services: - ./.env environment: - ZOEKT_URL=http://zoekt-webserver:6070/api/search - command: ["python", "./runner.py", "--construct_queries", "--search_queries", "--pattern", "password_based_authentication", "--web_framework", "fastapi", "--language", "python", "--root_data_dir=/data"] + command: ["python", "./runner.py", "--construct_queries", "--search_queries", "--pattern", "verifiable_token_authentication", "--web_framework", "fastapi", "--language", "python", "--root_data_dir=/data"] depends_on: - zoekt-webserver deploy: diff --git a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml index 391a2d6..d07958f 100644 --- 
a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml +++ b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/password_based_authentication.yaml @@ -9,7 +9,7 @@ dependencies: - bcrypt repo_metadata_file: - - python_Pypi_mutual_dependents_fastapi_passlib_bcrypt.jsonl + - python_Pypi_mutual_dependents_fastapi_passlib.jsonl roles: enforcer: @@ -41,12 +41,12 @@ roles: hasher: description: "Calculates the hash value for a given input" queries: - - query: "CryptContext hash" + - query: "passlib CryptContext hash" description: "Files with password hashing implementations" - query: "pwd_context hash password " description: "Direct password hashing calls" - - query: "bcrypt gensalt hashpw" - description: "Bcrypt salt generation and hashing" + - query: "bcrypt hashpw" + description: "Bcrypt hashing" password_store: description: "Keeps track of hash values corresponding to each registered identity" diff --git a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/verifiable_token_authentication.yaml b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/verifiable_token_authentication.yaml index 351a80c..35f81cd 100644 --- a/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/verifiable_token_authentication.yaml +++ b/security_pattern_miner/src/context_retriever/queries_library/python/fastapi/patterns/verifiable_token_authentication.yaml @@ -23,50 +23,53 @@ roles: - query: " Depends OAuth2PasswordBearer" description: "HTTP authorization credentials dependency" priority: high + - query: HTTPAuthorizationCredentials Security + description: "Authorization credentials extraction" + priority: medium + verifier: description: "Manages the verification of whether a token is valid" queries: - - query: "jwt decode SECRET_KEY algorithms" + - query: "jwt decode algorithms" description: "JWT 
token verification" priority: high - - query: "def verify_token jwt decode" + - query: "jwt ExpiredSignatureError" description: "Token verification functions" priority: high - - query: "jwt decode token, SECRET_KEY, algorithms" + - query: "jwt InvalidTokenError" description: "Direct JWT decode calls" priority: high - - query: "payload jwt decode JWTError" - description: "JWT decode with error handling" - priority: high - - - query: "credentials jwt decode" - description: "Extracting and verifying JWT from credentials" + - query: "jwt.exceptions DecodeError" + description: "JWT decode error handling" + priority: medium + + - query: "revoke.*token" + description: "Token revocation handling" priority: medium - cryptographer_mac: + cryptography_manager: description: "Provides cryptographic primitives for MAC-based tokens (HMAC)" queries: - - query: "jwt.encode HS256 SECRET_KEY" + - query: "jwt encode HS256" description: "JWT encoding with HMAC" priority: high - - - query: "from jose import jwt" - description: "Python-jose JWT import" + + - query: "jwt encode RS256" + description: "RSA signature algorithm configuration" priority: high - - - query: "algorithm HS256 jwt.encode" - description: "HMAC algorithm configuration" + + - query: "generate.*signature" + description: "Signature generation" priority: high - - - query: "jwt encode decode HS256" - description: "Complete HMAC JWT operations" - priority: medium - cryptographer_signature: + - query: "verify.*signature" + description: "JWT verification with HMAC" + priority: high + description: "Provides cryptographic primitives for digitally signed tokens (RSA)" queries: - query: "jwt.encode RS256 private_key" @@ -85,66 +88,23 @@ roles: description: "RSA algorithm variants" priority: medium - key_manager_hmac: - description: "Manages cryptographic keys for HMAC tokens" + key_manager: + description: "Rotate keys" queries: - - query: "SECRET_KEY = os.getenv" - description: "Secret key from environment" -
priority: high - - - query: "SECRET_KEY ALGORITHM HS256 " - description: "HMAC key and algorithm configuration" - priority: high - - - query: "load_dotenv SECRET_KEY" - description: "Loading secret keys from environment" - priority: medium - - key_manager_rsa: - description: "Manages cryptographic keys for RSA signed tokens" - queries: - - query: "rsa.generate_private_key public_exponent" - description: "RSA key generation" - priority: high - - - query: "from cryptography.hazmat.primitives.asymmetric import rsa" - description: "RSA key management imports" - priority: high - - - query: "private_key public_key serialization" - description: "Public key extraction and serialization" - priority: high - - - query: "serialization PrivateFormat PKCS8" - description: "Private key serialization" - priority: medium - - - query: "serialization PublicFormat SubjectPublicKeyInfo" - description: "Public key serialization" + - query: "rotate.*key" + description: "Key rotation routines" priority: medium token_generator: description: "Manages the generation of new tokens" queries: - - query: "def create_access_token jwt encode" + - query: "create.*token datetime" description: "Access token creation functions" priority: high - - query: "jwt encode exp sub" - description: "JWT encoding with expiration and subject" - priority: high - - - query: "create_access_token data dict expires_delta" - description: "Token creation with expiration parameter" - priority: high - - - query: "timedelta minutes jwt encode" - description: "Token expiration time calculation" + - query: "secrets token_urlsafe" + description: "Secure token generation" priority: medium - - - query: "datetime.utcnow expires_delta" - description: "Expiration timestamp calculation" - priority: low registrar: description: "Provides the Subject a token after successful authentication" @@ -153,36 +113,10 @@ roles: description: "Token issuance endpoints" priority: high - - query: "return access token token_type bearer" + - query:
"return access token bearer" description: "Token response formatting" priority: high - - query: "app post login jwt.encode" - description: "Login endpoint with JWT generation" - priority: high - - - query: "authenticate_user create_access_token" - description: "Authentication followed by token creation" - priority: medium - - token_blacklist: - description: "Tracks revoked but not yet expired tokens (optional feature)" - queries: - - query: "redis sadd revoked_tokens jwt" - description: "Token revocation with Redis" - priority: medium - - - query: "blacklist revoke token" - description: "Token blacklist management" - priority: medium - - - query: "class TokenBlacklist revoked_tokens" - description: "Token blacklist class" - priority: medium - - - query: "is_revoked token HTTPException" - description: "Revocation checking" - priority: low endpoints: