From d0390f061aeac2a330ae1b89b2c81a75c428aabc Mon Sep 17 00:00:00 2001 From: Mads Nylund Date: Mon, 5 Aug 2024 23:52:13 +0200 Subject: [PATCH 1/2] first version of embedding and uploading to vector db --- package.json | 3 + pnpm-lock.yaml | 234 +++++++++++++++++++++++++++++++++++++- rag/lib/client.js | 17 +++ rag/lib/client.ts | 20 ++++ rag/lib/markdown.js | 22 ++++ rag/lib/markdown.ts | 33 ++++++ rag/lib/markdownParser.js | 36 ++++++ rag/lib/markdownParser.ts | 56 +++++++++ rag/lib/openAI.js | 54 +++++++++ rag/lib/openAI.ts | 14 +++ rag/lib/splitter.js | 19 ++++ rag/lib/splitter.ts | 22 ++++ rag/lib/supabase.js | 86 ++++++++++++++ rag/lib/supabase.ts | 36 ++++++ rag/main.js | 57 ++++++++++ rag/main.ts | 20 ++++ rag/query.js | 78 +++++++++++++ rag/query.ts | 49 ++++++++ tsconfig.json | 2 +- 19 files changed, 856 insertions(+), 2 deletions(-) create mode 100644 rag/lib/client.js create mode 100644 rag/lib/client.ts create mode 100644 rag/lib/markdown.js create mode 100644 rag/lib/markdown.ts create mode 100644 rag/lib/markdownParser.js create mode 100644 rag/lib/markdownParser.ts create mode 100644 rag/lib/openAI.js create mode 100644 rag/lib/openAI.ts create mode 100644 rag/lib/splitter.js create mode 100644 rag/lib/splitter.ts create mode 100644 rag/lib/supabase.js create mode 100644 rag/lib/supabase.ts create mode 100644 rag/main.js create mode 100644 rag/main.ts create mode 100644 rag/query.js create mode 100644 rag/query.ts diff --git a/package.json b/package.json index f0ee2ff..62db6d7 100644 --- a/package.json +++ b/package.json @@ -16,6 +16,7 @@ "@markdoc/markdoc": "^0.4.0", "@markdoc/next.js": "^0.3.4", "@sindresorhus/slugify": "^2.1.0", + "@supabase/supabase-js": "^2.45.0", "@tailwindcss/forms": "^0.5.7", "@tailwindcss/typography": "^0.5.7", "@tanstack/react-query": "^5.29.0", @@ -27,6 +28,7 @@ "autoprefixer": "^10.4.12", "clsx": "^2.1.0", "date-fns": "^3.6.0", + "dotenv": "^16.4.5", "fast-glob": "^3.2.12", "flexsearch": "^0.7.31", "formik": "^2.4.5", @@ -34,6 +36,7 @@ "next": "^14.0.4", "next-auth": "^4.24.7", "next-themes": "^0.2.1", + "openai": "^4.54.0", "prism-react-renderer": "^2.0.6", "react": "^18.2.0", "react-dom": "^18.2.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index ea277b7..ee6e8ef 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -23,6 +23,9 @@ dependencies: '@sindresorhus/slugify': specifier: ^2.1.0 version: 2.2.1 + '@supabase/supabase-js': + specifier: ^2.45.0 + version: 2.45.0 '@tailwindcss/forms': specifier: ^0.5.7 version: 0.5.7(tailwindcss@3.4.3) @@ -56,6 +59,9 @@ dependencies: date-fns: specifier: ^3.6.0 version: 3.6.0 + dotenv: + specifier: ^16.4.5 + version: 16.4.5 fast-glob: specifier: ^3.2.12 version: 3.3.2 @@ -77,6 +83,9 @@ dependencies: next-themes: specifier: ^0.2.1 version: 0.2.1(next@14.1.4)(react-dom@18.2.0)(react@18.2.0) + openai: + specifier: ^4.54.0 + version: 4.54.0 prism-react-renderer: specifier: ^2.0.6 version: 2.3.1(react@18.2.0) @@ -775,6 +784,63 @@ packages: escape-string-regexp: 5.0.0 dev: false + /@supabase/auth-js@2.64.4: + resolution: {integrity: sha512-9ITagy4WP4FLl+mke1rchapOH0RQpf++DI+WSG2sO1OFOZ0rW3cwAM0nCrMOxu+Zw4vJ4zObc08uvQrXx590Tg==} + dependencies: + '@supabase/node-fetch': 2.6.15 + dev: false + + /@supabase/functions-js@2.4.1: + resolution: {integrity: sha512-8sZ2ibwHlf+WkHDUZJUXqqmPvWQ3UHN0W30behOJngVh/qHHekhJLCFbh0AjkE9/FqqXtf9eoVvmYgfCLk5tNA==} + dependencies: + '@supabase/node-fetch': 2.6.15 + dev: false + + /@supabase/node-fetch@2.6.15: + resolution: {integrity: sha512-1ibVeYUacxWYi9i0cf5efil6adJ9WRyZBLivgjs+AUpewx1F3xPi7gLgaASI2SmIQxPoCEjAsLAzKPgMJVgOUQ==} + engines: {node: 4.x || >=6.0.0} + dependencies: + whatwg-url: 5.0.0 + dev: false + + /@supabase/postgrest-js@1.15.8: + resolution: {integrity: sha512-YunjXpoQjQ0a0/7vGAvGZA2dlMABXFdVI/8TuVKtlePxyT71sl6ERl6ay1fmIeZcqxiuFQuZw/LXUuStUG9bbg==} + dependencies: + '@supabase/node-fetch': 2.6.15 + dev: false + + /@supabase/realtime-js@2.10.2: + resolution: {integrity: sha512-qyCQaNg90HmJstsvr2aJNxK2zgoKh9ZZA8oqb7UT2LCh3mj9zpa3Iwu167AuyNxsxrUE8eEJ2yH6wLCij4EApA==} + dependencies: + '@supabase/node-fetch': 2.6.15 + '@types/phoenix': 1.6.5 + '@types/ws': 8.5.12 + ws: 8.18.0 + transitivePeerDependencies: + - bufferutil + - utf-8-validate + dev: false + + /@supabase/storage-js@2.6.0: + resolution: {integrity: sha512-REAxr7myf+3utMkI2oOmZ6sdplMZZ71/2NEIEMBZHL9Fkmm3/JnaOZVSRqvG4LStYj2v5WhCruCzuMn6oD/Drw==} + dependencies: + '@supabase/node-fetch': 2.6.15 + dev: false + + /@supabase/supabase-js@2.45.0: + resolution: {integrity: sha512-j66Mfs8RhzCQCKxKogAFQYH9oNhRmgIdKk6pexguI2Oc7hi+nL9UNJug5aL1tKnBdaBM3h65riPLQSdL6sWa3Q==} + dependencies: + '@supabase/auth-js': 2.64.4 + '@supabase/functions-js': 2.4.1 + '@supabase/node-fetch': 2.6.15 + '@supabase/postgrest-js': 1.15.8 + '@supabase/realtime-js': 2.10.2 + '@supabase/storage-js': 2.6.0 + transitivePeerDependencies: + - bufferutil + - utf-8-validate + dev: false + /@swc/helpers@0.5.2: resolution: {integrity: sha512-E4KcWTpoLHqwPHLxidpOqQbcrZVgi0rsmmZXUle1jXmJfuIf/UWpczUJ7MZZ5tlxytgJXyp0w4PGkkeLiuIdZw==} dependencies: @@ -916,12 +982,29 @@ packages: resolution: {integrity: sha512-nG96G3Wp6acyAgJqGasjODb+acrI7KltPiRxzHPXnP3NgI28bpQDRv53olbqGXbfcgF5aiiHmO3xpwEpS5Ld9g==} dev: false + /@types/node-fetch@2.6.11: + resolution: {integrity: sha512-24xFj9R5+rfQJLRyM56qh+wnVSYhyXC2tkoBndtY0U+vubqNsYXGjufB2nn8Q6gt0LrARwL6UBtMCSVCwl4B1g==} + dependencies: + '@types/node': 20.12.5 + form-data: 4.0.0 + dev: false + + /@types/node@18.19.43: + resolution: {integrity: sha512-Mw/YlgXnyJdEwLoFv2dpuJaDFriX+Pc+0qOBJ57jC1H6cDxIj2xc5yUrdtArDVG0m+KV6622a4p2tenEqB3C/g==} + dependencies: + undici-types: 5.26.5 + dev: false + /@types/node@20.12.5: resolution: {integrity: sha512-BD+BjQ9LS/D8ST9p5uqBxghlN+S42iuNxjsUGjeZobe/ciXzk2qb1B6IXc6AnRLS+yFJRpN2IPEHMzwspfDJNw==} dependencies: undici-types: 5.26.5 dev: false + /@types/phoenix@1.6.5: + resolution: {integrity: sha512-xegpDuR+z0UqG9fwHqNoy3rI7JDlvaPh2TY47Fl80oq6g+hXT+c/LEuE43X48clZ6lOfANl5WrPur9fYO1RJ/w==} + dev: false + /@types/prismjs@1.26.3: resolution: {integrity: sha512-A0D0aTXvjlqJ5ZILMz3rNfDBOx9hHxLZYv2by47Sm/pqW35zzjusrZTryatjN/Rf8Us2gZrJD+KeHbUSTux1Cw==} dev: false @@ -961,6 +1044,12 @@ packages: resolution: {integrity: sha512-dqId9J8K/vGi5Zr7oo212BGii5m3q5Hxlkwy3WpYuKPklmBEvsbMYYyLxAQpSffdLl/gdW0XUpKWFvYmyoWCoQ==} dev: false + /@types/ws@8.5.12: + resolution: {integrity: sha512-3tPRkv1EtkDpzlgyKyI8pGsGZAGPEaXeu0DOj5DI25Ja91bdAYddYHbADRYVrZMRbfW+1l5YwXVDKohDJNQxkQ==} + dependencies: + '@types/node': 20.12.5 + dev: false + /@typescript-eslint/parser@6.21.0(eslint@8.57.0)(typescript@5.4.4): resolution: {integrity: sha512-tbsV1jPne5CkFQCgPBcDOt30ItF7aJoZL997JSF7MhGQqOeT3svWRYxiqlfA5RUdlHN6Fi+EI9bxqbdyAUZjYQ==} engines: {node: ^16.0.0 || >=18.0.0} @@ -1094,6 +1183,13 @@ packages: /@ungap/structured-clone@1.2.0: resolution: {integrity: sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==} + /abort-controller@3.0.0: + resolution: {integrity: sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==} + engines: {node: '>=6.5'} + dependencies: + event-target-shim: 5.0.1 + dev: false + /acorn-jsx@5.3.2(acorn@8.11.3): resolution: {integrity: sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ==} peerDependencies: @@ -1108,6 +1204,13 @@ packages: hasBin: true dev: true + /agentkeepalive@4.5.0: + resolution: {integrity: sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==} + engines: {node: '>= 8.0.0'} + dependencies: + humanize-ms: 1.2.1 + dev: false + /ajv@6.12.6: resolution: {integrity: sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==} dependencies: @@ -1286,6 +1389,10 @@ packages: resolution: {integrity: sha512-OH/2E5Fg20h2aPrbe+QL8JZQFko0YZaF+j4mnQ7BGhfavO7OpSLa8a0y9sBwomHdSbkhTS8TQNayBfnW5DwbvQ==} dev: true + /asynckit@0.4.0: + resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==} + dev: false + /autoprefixer@10.4.19(postcss@8.4.38): resolution: {integrity: sha512-BaENR2+zBZ8xXhM4pUaKUxlVdxZ0EZhjvbopwnXmxRUfqDmwSpC2lAi/QXvx7NRdPCo1WKEcEF6mV64si1z4Ew==} engines: {node: ^10 || ^12 || >=14} @@ -1481,6 +1588,13 @@ packages: color-string: 1.9.1 dev: true + /combined-stream@1.0.8: + resolution: {integrity: sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==} + engines: {node: '>= 0.8'} + dependencies: + delayed-stream: 1.0.0 + dev: false + /comma-separated-tokens@2.0.3: resolution: {integrity: sha512-Fu4hJdvzeylCfQPp9SGWidpzrMs7tTrlu6Vb8XGaRGck8QSNZJJp538Wrb60Lax4fPwR64ViY468OIUTbRlGZg==} dev: false @@ -1611,6 +1725,11 @@ packages: object-keys: 1.1.1 dev: true + /delayed-stream@1.0.0: + resolution: {integrity: sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==} + engines: {node: '>=0.4.0'} + dev: false + /dequal@2.0.3: resolution: {integrity: sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==} engines: {node: '>=6'} @@ -1660,6 +1779,11 @@ packages: esutils: 2.0.3 dev: true + /dotenv@16.4.5: + resolution: {integrity: sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==} + engines: {node: '>=12'} + dev: false + /eastasianwidth@0.2.0: resolution: {integrity: sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==} @@ -2102,6 +2226,11 @@ packages: engines: {node: '>=0.10.0'} dev: true + /event-target-shim@5.0.1: + resolution: {integrity: sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==} + engines: {node: '>=6'} + dev: false + /extend@3.0.2: resolution: {integrity: sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==} dev: false @@ -2184,6 +2313,27 @@ packages: cross-spawn: 7.0.3 signal-exit: 4.1.0 + /form-data-encoder@1.7.2: + resolution: {integrity: sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==} + dev: false + + /form-data@4.0.0: + resolution: {integrity: sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==} + engines: {node: '>= 6'} + dependencies: + asynckit: 0.4.0 + combined-stream: 1.0.8 + mime-types: 2.1.35 + dev: false + + /formdata-node@4.4.1: + resolution: {integrity: sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==} + engines: {node: '>= 12.20'} + dependencies: + node-domexception: 1.0.0 + web-streams-polyfill: 4.0.0-beta.3 + dev: false + /formik@2.4.5(react@18.2.0): resolution: {integrity: sha512-Gxlht0TD3vVdzMDHwkiNZqJ7Mvg77xQNfmBRrNtvzcHZs72TJppSTDKHpImCMJZwcWPBJ8jSQQ95GJzXFf1nAQ==} peerDependencies: @@ -2581,6 +2731,12 @@ packages: resolution: {integrity: sha512-bEqo66MRXsUGxWHV5IP0PUiAWwoEjba4VCzg0LjFJBpchPaTfyfCKTG6bc5F8ucKec3q5y6qOdGyYTSBEvhCrg==} dev: false + /humanize-ms@1.2.1: + resolution: {integrity: sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==} + dependencies: + ms: 2.1.3 + dev: false + /ignore@5.3.1: resolution: {integrity: sha512-5Fytz/IraMjqpwfd34ke28PTVMjZjJG2MPn5t7OE4eUCUNf8BAa7b5WUS9/Qvr6mwOQS7Mk6vdsMno5he+T8Xw==} engines: {node: '>= 4'} @@ -3445,6 +3601,18 @@ packages: braces: 3.0.2 picomatch: 2.3.1 + /mime-db@1.52.0: + resolution: {integrity: sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==} + engines: {node: '>= 0.6'} + dev: false + + /mime-types@2.1.35: + resolution: {integrity: sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==} + engines: {node: '>= 0.6'} + dependencies: + mime-db: 1.52.0 + dev: false + /mini-svg-data-uri@1.4.4: resolution: {integrity: sha512-r9deDe9p5FJUPZAk3A59wGH7Ii9YrjjWw0jmw/liSbHl2CHiyXj6FcDXDu2K3TjVAXqiJdaw3xxwlZZr9E6nHg==} hasBin: true @@ -3482,7 +3650,6 @@ packages: /ms@2.1.3: resolution: {integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==} - dev: true /mz@2.7.0: resolution: {integrity: sha512-z81GNO7nnYMEhrGh9LeymoE4+Yr0Wn5McHIZMK5cfQCl+NDX08sCZgUc9/6MHni9IWuFLm1Z3HTCXu2z9fN62Q==} @@ -3578,6 +3745,23 @@ packages: - babel-plugin-macros dev: false + /node-domexception@1.0.0: + resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==} + engines: {node: '>=10.5.0'} + dev: false + + /node-fetch@2.7.0: + resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==} + engines: {node: 4.x || >=6.0.0} + peerDependencies: + encoding: ^0.1.0 + peerDependenciesMeta: + encoding: + optional: true + dependencies: + whatwg-url: 5.0.0 + dev: false + /node-releases@2.0.14: resolution: {integrity: sha512-y10wOWt8yZpqXmOgRo77WaHEmhYQYGNA6y421PKsKYWEK8aW+cqAphborZDhqfyKrbZEN92CN1X2KbafY2s7Yw==} dev: false @@ -3696,6 +3880,21 @@ packages: wrappy: 1.0.2 dev: true + /openai@4.54.0: + resolution: {integrity: sha512-e/12BdtTtj+tXs7iHm+Dm7H7WjEWnw7O52B2wSfCQ6lD5F6cvjzo7cANXy5TJ1Q3/qc8YRPT5wBTTFtP5sBp1g==} + hasBin: true + dependencies: + '@types/node': 18.19.43 + '@types/node-fetch': 2.6.11 + abort-controller: 3.0.0 + agentkeepalive: 4.5.0 + form-data-encoder: 1.7.2 + formdata-node: 4.4.1 + node-fetch: 2.7.0 + transitivePeerDependencies: + - encoding + dev: false + /openid-client@5.6.5: resolution: {integrity: sha512-5P4qO9nGJzB5PI0LFlhj4Dzg3m4odt0qsJTfyEtZyOlkgpILwEioOhVVJOrS1iVH494S4Ee5OCjjg6Bf5WOj3w==} dependencies: @@ -4670,6 +4869,10 @@ packages: resolution: {integrity: sha512-0a5EOkAUp8D4moMi2W8ZF8jcga7BgZd91O/yabJCFY8az+XSzeGyTKs0Aoo897iV1Nj6guFq8orWDS96z91oGg==} dev: false + /tr46@0.0.3: + resolution: {integrity: sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==} + dev: false + /trim-lines@3.0.1: resolution: {integrity: sha512-kRj8B+YHZCc9kQYdWfJB2/oUl9rA99qbowYYBtr4ui4mZyAQ2JpvVBd/6U2YloATfqBhBTSMhTpgBHtU0Mf3Rg==} dev: false @@ -4903,6 +5106,22 @@ packages: resolution: {integrity: sha512-bKr1DkiNa2krS7qxNtdrtHAmzuYGFQLiQ13TsorsdT6ULTkPLKuu5+GsFpDlg6JFjUTwX2DyhMPG2be8uPrqsQ==} dev: false + /web-streams-polyfill@4.0.0-beta.3: + resolution: {integrity: sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==} + engines: {node: '>= 14'} + dev: false + + /webidl-conversions@3.0.1: + resolution: {integrity: sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==} + dev: false + + /whatwg-url@5.0.0: + resolution: {integrity: sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==} + dependencies: + tr46: 0.0.3 + webidl-conversions: 3.0.1 + dev: false + /which-boxed-primitive@1.0.2: resolution: {integrity: sha512-bwZdv0AKLpplFY2KZRX6TvyuN7ojjr7lwkg6ml0roIy9YeuSr7JS372qlNW18UQYzgYK9ziGcerWqZOmEn9VNg==} dependencies: @@ -4979,6 +5198,19 @@ packages: resolution: {integrity: sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==} dev: true + /ws@8.18.0: + resolution: {integrity: sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==} + engines: {node: '>=10.0.0'} + peerDependencies: + bufferutil: ^4.0.1 + utf-8-validate: '>=5.0.2' + peerDependenciesMeta: + bufferutil: + optional: true + utf-8-validate: + optional: true + dev: false + /yallist@4.0.0: resolution: {integrity: sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==} diff --git a/rag/lib/client.js b/rag/lib/client.js new file mode 100644 index 0000000..f735ddb --- /dev/null +++ b/rag/lib/client.js @@ -0,0 +1,17 @@ +"use strict"; +Object.defineProperty(exports, "__esModule", { value: true }); +exports.openaiClient = void 0; +require('dotenv').config(); +var supabase_js_1 = require("@supabase/supabase-js"); +var openai_1 = require("openai"); +var SUPBASE_URL = process.env.SUPABASE_URL; +var SUPBASE_KEY = process.env.SUPABASE_KEY; +var OPENAI_API_KEY = process.env.OPENAI_API_KEY; +if (!SUPBASE_URL || !SUPBASE_KEY || !OPENAI_API_KEY) { + throw new Error('Missing SUPABASE_URL or SUPABASE_KEY or OPENAI_API_KEY'); +} +var supaBaseClient = (0, supabase_js_1.createClient)(SUPBASE_URL, SUPBASE_KEY); +exports.openaiClient = new openai_1.default({ + apiKey: OPENAI_API_KEY +}); +exports.default = supaBaseClient; diff --git a/rag/lib/client.ts b/rag/lib/client.ts new file mode 100644 index 0000000..5b2c15a --- /dev/null +++ b/rag/lib/client.ts @@ -0,0 +1,20 @@ +require('dotenv').config(); + +import { createClient } from '@supabase/supabase-js'; +import OpenAI from 'openai'; + +const SUPBASE_URL = process.env.SUPABASE_URL; +const SUPBASE_KEY = process.env.SUPABASE_KEY; +const OPENAI_API_KEY = process.env.OPENAI_API_KEY; + +if (!SUPBASE_URL || !SUPBASE_KEY || !OPENAI_API_KEY) { + throw new Error('Missing SUPABASE_URL or SUPABASE_KEY or OPENAI_API_KEY'); +} + +const supaBaseClient = createClient(SUPBASE_URL, SUPBASE_KEY); + +export const openaiClient = new OpenAI({ + apiKey: OPENAI_API_KEY +}); + +export default supaBaseClient; \ No newline at end of file diff --git a/rag/lib/markdown.js b/rag/lib/markdown.js new file mode 100644 index 0000000..9a85661 --- /dev/null +++ b/rag/lib/markdown.js @@ -0,0 +1,22 @@ +"use strict"; +Object.defineProperty(exports, "__esModule", { value: true }); +var fs_1 = require("fs"); +var path_1 = require("path"); +var readMarkdownFiles = function (dir, filesContent) { + if (filesContent === void 0) { filesContent = {}; } + var files = (0, fs_1.readdirSync)(dir); + for (var _i = 0, files_1 = files; _i < files_1.length; _i++) { + var file = files_1[_i]; + var filePath = (0, path_1.join)(dir, file); + var stat = (0, fs_1.statSync)(filePath); + if (stat.isDirectory()) { + readMarkdownFiles(filePath, filesContent); + } + else if ((0, path_1.extname)(file) === '.md') { + var content = (0, fs_1.readFileSync)(filePath, 'utf-8'); + filesContent[filePath] = content; + } + } + return filesContent; +}; +exports.default = readMarkdownFiles; diff --git a/rag/lib/markdown.ts b/rag/lib/markdown.ts new file mode 100644 index 0000000..ec687d3 --- /dev/null +++ b/rag/lib/markdown.ts @@ -0,0 +1,33 @@ +import { readdirSync, readFileSync, statSync } from "fs"; +import { extname, join } from "path"; + + +export type MarkdownDocuments = { + [key: string]: string; +}; + +export type MarkdownDocument = { + path: string; + content: string; +} + +const readMarkdownFiles = (dir: string, filesContent: MarkdownDocuments = {}): MarkdownDocuments => { + const files = readdirSync(dir); + + for (const file of files) { + const filePath = join(dir, file); + const stat = statSync(filePath); + + if (stat.isDirectory()) { + readMarkdownFiles(filePath, filesContent); + } else if (extname(file) === '.md') { + const content = readFileSync(filePath, 'utf-8'); + filesContent[filePath] = content; + } + } + + return filesContent; +} + + +export default readMarkdownFiles; \ No newline at end of file diff --git a/rag/lib/markdownParser.js b/rag/lib/markdownParser.js new file mode 100644 index 0000000..e37fd00 --- /dev/null +++ b/rag/lib/markdownParser.js @@ -0,0 +1,36 @@ +"use strict"; +Object.defineProperty(exports, "__esModule", { value: true }); +var splitter_1 = require("./splitter"); +var parseTitle = function (content) { + var titleMatch = content.match(/^---\s*title:\s*(.*?)\s*---/s); + return titleMatch ? titleMatch[1] : null; +}; +var removeTitle = function (content) { + return content.replace(/^---\s*title:\s*(.*?)\s*---/s, ''); +}; +var parsePath = function (path) { + var normalizedPath = path.replace(/\\/g, '/'); + var docsIndex = normalizedPath.indexOf('/docs/'); + if (docsIndex !== -1) { + var docsPath = normalizedPath.substring(docsIndex + 1); + var lastSlashIndex = docsPath.lastIndexOf('/'); + if (lastSlashIndex !== -1) { + docsPath = docsPath.substring(0, lastSlashIndex); + } + return docsPath; + } + return ''; +}; +var parseMDFile = function (document) { + var title = parseTitle(document.content); + var content = removeTitle(document.content); + var sections = (0, splitter_1.default)(content); + var path = parsePath(document.path); + return sections.map(function (section, index) { return ({ + title: title || path, + url: path, + content: section, + index: index + }); }); +}; +exports.default = parseMDFile; diff --git a/rag/lib/markdownParser.ts b/rag/lib/markdownParser.ts new file mode 100644 index 0000000..fb2a79c --- /dev/null +++ b/rag/lib/markdownParser.ts @@ -0,0 +1,56 @@ +import { MarkdownDocument } from "./markdown"; +import splitContent from "./splitter"; + + +export type Section = { + title: string; + url: string; + content: string; + index: number; +}; + +const parseTitle = (content: string): string | null => { + const titleMatch = content.match(/^---\s*title:\s*(.*?)\s*---/s); + return titleMatch ? titleMatch[1] : null; +}; + +const removeTitle = (content: string): string => { + return content.replace(/^---\s*title:\s*(.*?)\s*---/s, ''); +}; + +const parsePath = (path: string): string => { + const normalizedPath = path.replace(/\\/g, '/'); + + const docsIndex = normalizedPath.indexOf('/docs/'); + + if (docsIndex !== -1) { + let docsPath = normalizedPath.substring(docsIndex + 1); + + const lastSlashIndex = docsPath.lastIndexOf('/'); + if (lastSlashIndex !== -1) { + docsPath = docsPath.substring(0, lastSlashIndex); + } + + return docsPath; + } + + return ''; +} + +const parseMDFile = (document: MarkdownDocument): Section[] => { + const title = parseTitle(document.content); + const content = removeTitle(document.content); + + const sections = splitContent(content); + const path = parsePath(document.path); + + return sections.map((section, index) => ({ + title: title || path, + url: path, + content: section, + index + })); +}; + + +export default parseMDFile; \ No newline at end of file diff --git a/rag/lib/openAI.js b/rag/lib/openAI.js new file mode 100644 index 0000000..61191f7 --- /dev/null +++ b/rag/lib/openAI.js @@ -0,0 +1,54 @@ +"use strict"; +var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { + function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } + return new (P || (P = Promise))(function (resolve, reject) { + function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } + function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } + function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } + step((generator = generator.apply(thisArg, _arguments || [])).next()); + }); +}; +var __generator = (this && this.__generator) || function (thisArg, body) { + var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g; + return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g; + function verb(n) { return function (v) { return step([n, v]); }; } + function step(op) { + if (f) throw new TypeError("Generator is already executing."); + while (g && (g = 0, op[0] && (_ = 0)), _) try { + if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t; + if (y = 0, t) op = [op[0] & 2, t.value]; + switch (op[0]) { + case 0: case 1: t = op; break; + case 4: _.label++; return { value: op[1], done: false }; + case 5: _.label++; y = op[1]; op = [0]; continue; + case 7: op = _.ops.pop(); _.trys.pop(); continue; + default: + if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; } + if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; } + if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; } + if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; } + if (t[2]) _.ops.pop(); + _.trys.pop(); continue; + } + op = body.call(thisArg, _); + } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; } + if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true }; + } +}; +Object.defineProperty(exports, "__esModule", { value: true }); +var client_1 = require("./client"); +var embeddContent = function (content) { return __awaiter(void 0, void 0, void 0, function () { + var response; + return __generator(this, function (_a) { + switch (_a.label) { + case 0: return [4 /*yield*/, client_1.openaiClient.embeddings.create({ + model: 'text-embedding-3-small', + input: content + })]; + case 1: + response = _a.sent(); + return [2 /*return*/, response.data[0].embedding]; + } + }); +}); }; +exports.default = embeddContent; diff --git a/rag/lib/openAI.ts b/rag/lib/openAI.ts new file mode 100644 index 0000000..a202ea4 --- /dev/null +++ b/rag/lib/openAI.ts @@ -0,0 +1,14 @@ +import { openaiClient } from "./client"; + + +const embeddContent = async (content: string): Promise => { + const response = await openaiClient.embeddings.create({ + model: 'text-embedding-3-small', + input: content + }) + + return response.data[0].embedding; +}; + + +export default embeddContent; \ No newline at end of file diff --git a/rag/lib/splitter.js b/rag/lib/splitter.js new file mode 100644 index 0000000..ebed483 --- /dev/null +++ b/rag/lib/splitter.js @@ -0,0 +1,19 @@ +"use strict"; +Object.defineProperty(exports, "__esModule", { value: true }); +var splitContent = function (content, tokenSize, overlap) { + if (tokenSize === void 0) { tokenSize = 200; } + if (overlap === void 0) { overlap = 50; } + var text = content.replace(/\n/g, ' '); + var words = text.split(/\s+/); + var sections = []; + for (var i = 0; i < words.length; i += (tokenSize - overlap)) { + var section = words.slice(i, i + tokenSize).join(' '); + sections.push(section); + // Break if the next section would be less than tokenSize - overlap in size + if (i + tokenSize >= words.length) { + break; + } + } + return sections; +}; +exports.default = splitContent; diff --git a/rag/lib/splitter.ts b/rag/lib/splitter.ts new file mode 100644 index 0000000..c7688bf --- /dev/null +++ b/rag/lib/splitter.ts @@ -0,0 +1,22 @@ + + +const splitContent = (content: string, tokenSize: number = 200, overlap: number = 50): string[] => { + const text = content.replace(/\n/g, ' '); + const words = text.split(/\s+/); + const sections: string[] = []; + + for (let i = 0; i < words.length; i += (tokenSize - overlap)) { + const section = words.slice(i, i + tokenSize).join(' '); + sections.push(section); + + // Break if the next section would be less than tokenSize - overlap in size + if (i + tokenSize >= words.length) { + break; + } + } + + return sections; +}; + + +export default splitContent; \ No newline at end of file diff --git a/rag/lib/supabase.js b/rag/lib/supabase.js new file mode 100644 index 0000000..67de9c9 --- /dev/null +++ b/rag/lib/supabase.js @@ -0,0 +1,86 @@ +"use strict"; +var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { + function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } + return new (P || (P = Promise))(function (resolve, reject) { + function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } + function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } + function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } + step((generator = generator.apply(thisArg, _arguments || [])).next()); + }); +}; +var __generator = (this && this.__generator) || function (thisArg, body) { + var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g; + return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g; + function verb(n) { return function (v) { return step([n, v]); }; } + function step(op) { + if (f) throw new TypeError("Generator is already executing."); + while (g && (g = 0, op[0] && (_ = 0)), _) try { + if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t; + if (y = 0, t) op = [op[0] & 2, t.value]; + switch (op[0]) { + case 0: case 1: t = op; break; + case 4: _.label++; return { value: op[1], done: false }; + case 5: _.label++; y = op[1]; op = [0]; continue; + case 7: op = _.ops.pop(); _.trys.pop(); continue; + default: + if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; } + if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; } + if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; } + if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; } + if (t[2]) _.ops.pop(); + _.trys.pop(); continue; + } + op = body.call(thisArg, _); + } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; } + if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true }; + } +}; +Object.defineProperty(exports, "__esModule", { value: true }); +var client_1 = require("./client"); +var openAI_1 = require("./openAI"); +var uploadSections = function (sections) { return __awaiter(void 0, void 0, void 0, function () { + var _a, data, error, documentId, _i, sections_1, section, embeddedContent; + return __generator(this, function (_b) { + switch (_b.label) { + case 0: return [4 /*yield*/, client_1.default + .from('documents') + .insert({ + url: sections[0].url, + title: sections[0].title, + tag: 'docs' + }) + .select('id')]; + case 1: + _a = _b.sent(), data = _a.data, error = _a.error; + if (error) { + console.error('Error inserting document:', error); + return [2 /*return*/]; + } + ; + documentId = data[0].id; + _i = 0, sections_1 = sections; + _b.label = 2; + case 2: + if (!(_i < sections_1.length)) return [3 /*break*/, 6]; + section = sections_1[_i]; + return [4 /*yield*/, (0, openAI_1.default)(section.content)]; + case 3: + embeddedContent = _b.sent(); + return [4 /*yield*/, client_1.default + .from('sections') + .insert({ + document_id: documentId, + content: section.content, + embedding: embeddedContent + })]; + case 4: + _b.sent(); + _b.label = 5; + case 5: + _i++; + return [3 /*break*/, 2]; + case 6: return [2 /*return*/]; + } + }); +}); }; +exports.default = uploadSections; diff --git a/rag/lib/supabase.ts b/rag/lib/supabase.ts new file mode 100644 index 0000000..9bf8096 --- /dev/null +++ b/rag/lib/supabase.ts @@ -0,0 +1,36 @@ +import supaBaseClient from "./client"; +import { Section } from "./markdownParser"; +import embeddContent from "./openAI"; + + +const uploadSections = async (sections: Section[]) => { + const { data, error } = await supaBaseClient + .from('documents') + .insert({ + url: sections[0].url, + title: sections[0].title, + tag: 'docs' + }) + .select('id'); + + if (error) { + console.error('Error inserting document:', error); + return; + }; + + const documentId = data[0].id; + + for (const section of sections) { + const embeddedContent = await embeddContent(section.content); + await supaBaseClient + .from('sections') + .insert({ + document_id: documentId, + content: section.content, + embedding: embeddedContent + }) + } +}; + + +export default uploadSections; \ No newline at end of file diff --git a/rag/main.js b/rag/main.js new file mode 100644 index 0000000..e0f7b3a --- /dev/null +++ b/rag/main.js @@ -0,0 +1,57 @@ +"use strict"; +var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { + function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } + return new (P || (P = Promise))(function (resolve, reject) { + function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } + function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } + function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } + step((generator = generator.apply(thisArg, _arguments || [])).next()); + }); +}; +var __generator = (this && this.__generator) || function (thisArg, body) { + var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g; + return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g; + function verb(n) { return function (v) { return step([n, v]); }; } + function step(op) { + if (f) throw new TypeError("Generator is already executing."); + while (g && (g = 0, op[0] && (_ = 0)), _) try { + if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t; + if (y = 0, t) op = [op[0] & 2, t.value]; + switch (op[0]) { + case 0: case 1: t = op; break; + case 4: _.label++; return { value: op[1], done: false }; + case 5: _.label++; y = op[1]; op = [0]; continue; + case 7: op = _.ops.pop(); _.trys.pop(); continue; + default: + if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; } + if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; } + if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; } + if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; } + if (t[2]) _.ops.pop(); + _.trys.pop(); continue; + } + op = body.call(thisArg, _); + } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; } + if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true }; + } +}; +Object.defineProperty(exports, "__esModule", { value: true }); +var path_1 = require("path"); +var markdown_1 = require("./lib/markdown"); +var markdownParser_1 = require("./lib/markdownParser"); +var supabase_1 = require("./lib/supabase"); +var createEmbeddings = function () { return __awaiter(void 0, void 0, void 0, function () { + var startFolder, documents, _i, _a, _b, key, value, sections; + return __generator(this, function (_c) { + startFolder = (0, path_1.join)(__dirname, '..', 'src', 'app', '(private)', '(docs)', 'docs'); + documents = (0, markdown_1.default)(startFolder); + for (_i = 0, _a = Object.entries(documents); _i < _a.length; _i++) { + _b = _a[_i], key = _b[0], value = _b[1]; + console.log('Processing: ', key); + sections = (0, markdownParser_1.default)({ path: key, content: value }); + (0, supabase_1.default)(sections); + } + return [2 /*return*/]; + }); +}); }; +createEmbeddings(); diff --git a/rag/main.ts b/rag/main.ts new file mode 100644 index 0000000..7fa8522 --- /dev/null +++ b/rag/main.ts @@ -0,0 +1,20 @@ +import { join } from "path"; +import readMarkdownFiles from "./lib/markdown"; +import parseMDFile from "./lib/markdownParser"; +import uploadSections from "./lib/supabase"; + + +const createEmbeddings = async () => { + const startFolder = join(__dirname, '..', 'src', 'app', '(private)', '(docs)', 'docs'); + + const documents = readMarkdownFiles(startFolder); + + for (const [key, value] of Object.entries(documents)) { + console.log('Processing: ', key); + const sections = parseMDFile({ path: key, content: value }); + await uploadSections(sections); + } +}; + + +createEmbeddings(); \ No newline at end of file diff --git a/rag/query.js b/rag/query.js new file mode 100644 index 0000000..5ca826f --- /dev/null +++ b/rag/query.js @@ -0,0 +1,78 @@ +"use strict"; +var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { + function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } + return new (P || (P = Promise))(function (resolve, reject) { + function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } + function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } + function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } + step((generator = generator.apply(thisArg, _arguments || [])).next()); + }); +}; +var __generator = (this && this.__generator) || function (thisArg, body) { + var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g; + return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g; + function verb(n) { return function (v) { return step([n, v]); }; } + function step(op) { + if (f) throw new TypeError("Generator is already executing."); + while (g && (g = 0, op[0] && (_ = 0)), _) try { + if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t; + if (y = 0, t) op = [op[0] & 2, t.value]; + switch (op[0]) { + case 0: case 1: t = op; break; + case 4: _.label++; return { value: op[1], done: false }; + case 5: _.label++; y = op[1]; op = [0]; continue; + case 7: op = _.ops.pop(); _.trys.pop(); continue; + default: + if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; } + if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; } + if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; } + if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; } + if (t[2]) _.ops.pop(); + _.trys.pop(); continue; + } + op = body.call(thisArg, _); + } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; } + if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true }; + } +}; +Object.defineProperty(exports, "__esModule", { value: true }); +var client_1 = require("./lib/client"); +var openAI_1 = require("./lib/openAI"); +var query = function () { return __awaiter(void 0, void 0, void 0, function () { + var prompt, embeddings, documents, contextText, i, document_1, content, LLMPrompt, completion; + return __generator(this, function (_a) { + switch (_a.label) { + case 0: + prompt = 'Hvordan kan jeg finne ut mer om Pablo Escobar?'; + return [4 /*yield*/, (0, openAI_1.default)(prompt)]; + case 1: + embeddings = _a.sent(); + return [4 /*yield*/, client_1.default.rpc('match_documents_filter', { + query_embedding: embeddings, + match_threshold: 0.5, + match_count: 5, + filter_tag: 'docs' + })]; + case 2: + documents = (_a.sent()).data; + contextText = ''; + for (i = 0; i < documents.length; i++) { + document_1 = documents[i]; + content = document_1.content; + contextText += "".concat(content.trim(), "\n---\n"); + } + LLMPrompt = "\n Du er en veldig engasjert representant for Codex som elsker \u00E5 hjelpe utviklere! Gitt f\u00F8lgende instrukser fra Codex sin dokumentasjon, svar p\u00E5 sp\u00F8rsm\u00E5l med kun gitt informasjon. Hvis du er usikker og svaret ikke ligger i dokumentasjonen, svar \"Beklager, jeg vet ikke hvordan jeg skal hjelpe med det.\".\n\n Instrukser: ".concat(contextText, "\n\n Sp\u00F8rsm\u00E5l: \"\"\"\n ").concat(prompt, "\n \"\"\"\n\n Svar p\u00E5 norsk og i markdown-format, med relevante kodeblokker hvis det er n\u00F8dvendig.\n "); + return [4 /*yield*/, client_1.openaiClient.chat.completions.create({ + messages: [ + { role: 'system', content: LLMPrompt }, + ], + model: 'gpt-4o-mini' + })]; + case 3: + completion = _a.sent(); + console.log(completion.choices[0]); + return [2 /*return*/]; + } + }); +}); }; +query(); diff --git a/rag/query.ts b/rag/query.ts new file mode 100644 index 0000000..4b6fcf9 --- /dev/null +++ b/rag/query.ts @@ -0,0 +1,49 @@ +import supaBaseClient, { openaiClient } from "./lib/client"; +import embeddContent from "./lib/openAI"; + + +const query = async () => { + const prompt = 'Hvordan kan jeg finne ut mer om Pablo Escobar?'; + + const embeddings = await embeddContent(prompt); + + const { data: documents } = await supaBaseClient.rpc('match_documents_filter', { + query_embedding: embeddings, + match_threshold: 0.5, + match_count: 5, + filter_tag: 'docs' + }); + + let contextText = '' + + for (let i = 0; i < documents.length; i++) { + const document = documents[i] + const content = document.content + + contextText += `${content.trim()}\n---\n` + } + + const LLMPrompt = ` + Du er en veldig engasjert representant for Codex som elsker å hjelpe utviklere! Gitt følgende instrukser fra Codex sin dokumentasjon, svar på spørsmål med kun gitt informasjon. Hvis du er usikker og svaret ikke ligger i dokumentasjonen, svar "Beklager, jeg vet ikke hvordan jeg skal hjelpe med det.". + + Instrukser: ${contextText} + + Spørsmål: """ + ${prompt} + """ + + Svar på norsk og i markdown-format, med relevante kodeblokker hvis det er nødvendig. + `; + + const completion = await openaiClient.chat.completions.create({ + messages: [ + { role: 'system', content: LLMPrompt }, + ], + model: 'gpt-4o-mini' + }); + + console.log(completion.choices[0]); +} + + +query(); \ No newline at end of file diff --git a/tsconfig.json b/tsconfig.json index b54bf8b..1c134a2 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -1,6 +1,6 @@ { "compilerOptions": { - "target": "es6", + "target": "ES2020", "lib": ["dom", "dom.iterable", "esnext"], "allowJs": true, "skipLibCheck": true, From d3f167eca9acf37f72834ceda9077809ccc934ae Mon Sep 17 00:00:00 2001 From: Mads Nylund Date: Mon, 12 Aug 2024 19:13:49 +0200 Subject: [PATCH 2/2] stuff --- rag/lib/splitter.js | 36 +++++++++++++++++++++++++++++--- rag/lib/splitter.ts | 50 ++++++++++++++++++++++++++++++++++++++------- rag/main.js | 36 +++++++++++++++++++++++--------- rag/main.ts | 2 +- rag/query.js | 21 +++++-------------- rag/query.ts | 9 +++++--- 6 files changed, 114 insertions(+), 40 deletions(-) diff --git a/rag/lib/splitter.js b/rag/lib/splitter.js index ebed483..056adc8 100644 --- a/rag/lib/splitter.js +++ b/rag/lib/splitter.js @@ -1,16 +1,46 @@ "use strict"; Object.defineProperty(exports, "__esModule", { value: true }); var splitContent = function (content, tokenSize, overlap) { + // const text = content.replace(/\n/g, ' '); + // const words = text.split(/\s+/); + // const sections: string[] = []; if (tokenSize === void 0) { tokenSize = 200; } if (overlap === void 0) { overlap = 50; } + // for (let i = 0; i < words.length; i += (tokenSize - overlap)) { + // const section = words.slice(i, i + tokenSize).join(' '); + // sections.push(section); + // // Break if the next section would be less than tokenSize - overlap in size + // if (i + tokenSize >= words.length) { + // break; + // } + // } + // return sections; var text = content.replace(/\n/g, ' '); var words = text.split(/\s+/); var sections = []; + var inCodeBlock = false; for (var i = 0; i < words.length; i += (tokenSize - overlap)) { - var section = words.slice(i, i + tokenSize).join(' '); - sections.push(section); + var sectionWords = []; + var sectionLength = 0; + var j = i; + while (j < words.length && sectionLength < tokenSize) { + var word = words[j]; + sectionWords.push(word); + sectionLength++; + if (word.includes('```')) { + inCodeBlock = !inCodeBlock; + } + if (sectionLength >= tokenSize && inCodeBlock) { + sectionLength--; // Continue adding words if in code block + } + else if (sectionLength >= tokenSize && !inCodeBlock) { + break; + } + j++; + } + sections.push(sectionWords.join(' ')); // Break if the next section would be less than tokenSize - overlap in size - if (i + tokenSize >= words.length) { + if (j + tokenSize >= words.length) { break; } } diff --git a/rag/lib/splitter.ts b/rag/lib/splitter.ts index c7688bf..d2e46df 100644 --- a/rag/lib/splitter.ts +++ b/rag/lib/splitter.ts @@ -1,20 +1,56 @@ const splitContent = (content: string, tokenSize: number = 200, overlap: number = 50): string[] => { + // const text = content.replace(/\n/g, ' '); + // const words = text.split(/\s+/); + // const sections: string[] = []; + + // for (let i = 0; i < words.length; i += (tokenSize - overlap)) { + // const section = words.slice(i, i + tokenSize).join(' '); + // sections.push(section); + + // // Break if the next section would be less than tokenSize - overlap in size + // if (i + tokenSize >= words.length) { + // break; + // } + // } + + // return sections; const text = content.replace(/\n/g, ' '); const words = text.split(/\s+/); const sections: string[] = []; + let inCodeBlock = false; for (let i = 0; i < words.length; i += (tokenSize - overlap)) { - const section = words.slice(i, i + tokenSize).join(' '); - sections.push(section); - + let sectionWords = []; + let sectionLength = 0; + let j = i; + + while (j < words.length && sectionLength < tokenSize) { + const word = words[j]; + sectionWords.push(word); + sectionLength++; + + if (word.includes('```')) { + inCodeBlock = !inCodeBlock; + } + + if (sectionLength >= tokenSize && inCodeBlock) { + sectionLength--; // Continue adding words if in code block + } else if (sectionLength >= tokenSize && !inCodeBlock) { + break; + } + j++; + } + + sections.push(sectionWords.join(' ')); + // Break if the next section would be less than tokenSize - overlap in size - if (i + tokenSize >= words.length) { - break; + if (j + tokenSize >= words.length) { + break; } - } - + } + return sections; }; diff --git a/rag/main.js b/rag/main.js index e0f7b3a..ba5e384 100644 --- a/rag/main.js +++ b/rag/main.js @@ -41,17 +41,33 @@ var markdown_1 = require("./lib/markdown"); var markdownParser_1 = require("./lib/markdownParser"); var supabase_1 = require("./lib/supabase"); var createEmbeddings = function () { return __awaiter(void 0, void 0, void 0, function () { - var startFolder, documents, _i, _a, _b, key, value, sections; - return __generator(this, function (_c) { - startFolder = (0, path_1.join)(__dirname, '..', 'src', 'app', '(private)', '(docs)', 'docs'); - documents = (0, markdown_1.default)(startFolder); - for (_i = 0, _a = Object.entries(documents); _i < _a.length; _i++) { - _b = _a[_i], key = _b[0], value = _b[1]; - console.log('Processing: ', key); - sections = (0, markdownParser_1.default)({ path: key, content: value }); - (0, supabase_1.default)(sections); + var startFolder, documents, _i, _a, _b, key, value, sections, _c; + return __generator(this, function (_d) { + switch (_d.label) { + case 0: + startFolder = (0, path_1.join)(__dirname, '..', 'src', 'app', '(private)', '(docs)', 'docs'); + documents = (0, markdown_1.default)(startFolder); + _i = 0, _a = Object.entries(documents); + _d.label = 1; + case 1: + if (!(_i < _a.length)) return [3 /*break*/, 5]; + _b = _a[_i], key = _b[0], value = _b[1]; + console.log('Processing: ', key); + sections = (0, markdownParser_1.default)({ path: key, content: value }); + _c = sections.length; + if (!_c) return [3 /*break*/, 3]; + return [4 /*yield*/, (0, supabase_1.default)(sections)]; + case 2: + _c = (_d.sent()); + _d.label = 3; + case 3: + _c; + _d.label = 4; + case 4: + _i++; + return [3 /*break*/, 1]; + case 5: return [2 /*return*/]; } - return [2 /*return*/]; }); }); }; createEmbeddings(); diff --git a/rag/main.ts b/rag/main.ts index 7fa8522..d33f862 100644 --- a/rag/main.ts +++ b/rag/main.ts @@ -12,7 +12,7 @@ const createEmbeddings = async () => { for (const [key, value] of Object.entries(documents)) { console.log('Processing: ', key); const sections = parseMDFile({ path: key, content: value }); - await uploadSections(sections); + sections.length && await uploadSections(sections); } }; diff --git a/rag/query.js b/rag/query.js index 5ca826f..1104100 100644 --- a/rag/query.js +++ b/rag/query.js @@ -43,31 +43,20 @@ var query = function () { return __awaiter(void 0, void 0, void 0, function () { return __generator(this, function (_a) { switch (_a.label) { case 0: - prompt = 'Hvordan kan jeg finne ut mer om Pablo Escobar?'; + prompt = 'Hvordan kan jeg sette opp en modell med aksesskontroll?'; return [4 /*yield*/, (0, openAI_1.default)(prompt)]; case 1: embeddings = _a.sent(); - return [4 /*yield*/, client_1.default.rpc('match_documents_filter', { + return [4 /*yield*/, client_1.default.rpc('match_documents', { query_embedding: embeddings, match_threshold: 0.5, match_count: 5, - filter_tag: 'docs' + // filter_tag: 'docs' })]; case 2: documents = (_a.sent()).data; - contextText = ''; - for (i = 0; i < documents.length; i++) { - document_1 = documents[i]; - content = document_1.content; - contextText += "".concat(content.trim(), "\n---\n"); - } - LLMPrompt = "\n Du er en veldig engasjert representant for Codex som elsker \u00E5 hjelpe utviklere! Gitt f\u00F8lgende instrukser fra Codex sin dokumentasjon, svar p\u00E5 sp\u00F8rsm\u00E5l med kun gitt informasjon. Hvis du er usikker og svaret ikke ligger i dokumentasjonen, svar \"Beklager, jeg vet ikke hvordan jeg skal hjelpe med det.\".\n\n Instrukser: ".concat(contextText, "\n\n Sp\u00F8rsm\u00E5l: \"\"\"\n ").concat(prompt, "\n \"\"\"\n\n Svar p\u00E5 norsk og i markdown-format, med relevante kodeblokker hvis det er n\u00F8dvendig.\n "); - return [4 /*yield*/, client_1.openaiClient.chat.completions.create({ - messages: [ - { role: 'system', content: LLMPrompt }, - ], - model: 'gpt-4o-mini' - })]; + console.log(documents); + return [2 /*return*/]; case 3: completion = _a.sent(); console.log(completion.choices[0]); diff --git a/rag/query.ts b/rag/query.ts index 4b6fcf9..6b733ca 100644 --- a/rag/query.ts +++ b/rag/query.ts @@ -3,17 +3,20 @@ import embeddContent from "./lib/openAI"; const query = async () => { - const prompt = 'Hvordan kan jeg finne ut mer om Pablo Escobar?'; + const prompt = 'Hvordan kan jeg sette opp en modell med aksesskontroll?'; const embeddings = await embeddContent(prompt); - const { data: documents } = await supaBaseClient.rpc('match_documents_filter', { + const { data: documents } = await supaBaseClient.rpc('match_documents', { query_embedding: embeddings, match_threshold: 0.5, match_count: 5, - filter_tag: 'docs' + // filter_tag: 'docs' }); + console.log(documents) + return + let contextText = '' for (let i = 0; i < documents.length; i++) {