Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 11 additions & 16 deletions DEMO/custom_preprocessing_and_postprocessing_hooks.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"\n",
"- Remove punctuation from input queries before the VectorStore search process begins,\n",
"- Capitalising all text in an input query to the Vectorstore search process,\n",
"- Deduplicate results based on the doc_id column so that duplicate knowledgebase entries are not returned,\n",
"- Deduplicate results based on the doc_label column so that duplicate knowledgebase entries are not returned,\n",
"- Prevent users of the package from retrieving certain documents in your vectorstore,\n",
"- Removing hate speech from any input text.\n",
"\n",
Expand Down Expand Up @@ -51,7 +51,7 @@
" - Takes in a body of text and searches the vector store for semantically similar knowledgebase samples.\n",
"\n",
"2. **`reverse_search()`** \n",
" - Takes in document IDs and searches the vector store for entries with those IDs.\n",
" - Takes in document labels and searches the vector store for entries with those labels.\n",
"\n",
"3. **`embed()`** \n",
" - Takes in a body of text and uses the vectoriser model to convert the text into embeddings.\n",
Expand All @@ -66,7 +66,7 @@
"\n",
"This shows that the `VectorStore.search()` method expects:\n",
"- An **input dataclass object** with columns `[id, query]`. \n",
"- To output an **output dataclass object** with columns `[query_id, query_text, doc_id, doc_text, rank, score]`.\n",
"- To output an **output dataclass object** with columns `[query_id, query_text, doc_label, doc_text, rank, score]`.\n",
"\n",
"The use of these dataclasses both helps the user of the package to understand what data needs to be provided to the Vectorstore and how a user should interact with the objects being returned by these VectorStore functions. Additionally, this ensures robustness of the package by checking that the correct columns are present in the data before operating on it. \n",
"\n",
Expand Down Expand Up @@ -217,7 +217,7 @@
"source": [
"The below code uses our dataclasses to set up some data to pass to the VectorStore search method, notice that:\n",
 " * an exclamation mark in the query (that in some cases we may want to sanitise) is shown in the results. \n",
" * Also the results for the below query should also show several rows with the same ```'doc_id'``` value (because our example data file had multiple entries with the same id label)"
" * Also the results for the below query should also show several rows with the same ```'doc_label'``` value (because our example data file had multiple entries with the same label value)"
]
},
{
Expand Down Expand Up @@ -277,8 +277,8 @@
"\n",
"def drop_duplicates(input_data: VectorStoreSearchOutput) -> VectorStoreSearchOutput:\n",
"    # we want to deduplicate the ranking attribute of the pydantic model which is a pandas dataframe\n",
" # specifically we want to drop all but the first occurrence of each unique 'doc_id' value for each subset of query results\n",
" input_data = input_data.drop_duplicates(subset=[\"query_id\", \"doc_id\"], keep=\"first\")\n",
" # specifically we want to drop all but the first occurrence of each unique 'doc_label' value for each subset of query results\n",
" input_data = input_data.drop_duplicates(subset=[\"query_id\", \"doc_label\"], keep=\"first\")\n",
"\n",
" # BE CAREFUL: drop_duplicates returns an object of type DataFrame, not VectorStoreSearchOutput so we need to convert back to that type after this operation\n",
" input_data = VectorStoreSearchOutput(input_data)\n",
Expand Down Expand Up @@ -380,8 +380,8 @@
"outputs": [],
"source": [
"def drop_duplicates_and_reset_rank(input_object: VectorStoreSearchOutput) -> VectorStoreSearchOutput:\n",
" # Remove duplicates based on 'query_id' and 'doc_id'\n",
" input_object = input_object.drop_duplicates(subset=[\"query_id\", \"doc_id\"], keep=\"first\")\n",
" # Remove duplicates based on 'query_id' and 'doc_label'\n",
" input_object = input_object.drop_duplicates(subset=[\"query_id\", \"doc_label\"], keep=\"first\")\n",
"\n",
" # Reset the rank column per query_id using .loc to avoid SettingWithCopyWarning\n",
" input_object.loc[:, \"rank\"] = input_object.groupby(\"query_id\").cumcount()\n",
Expand Down Expand Up @@ -507,7 +507,7 @@
"source": [
"### Injecting Data into our classification results with a hook\n",
"\n",
"What if we had some additional context information that we wanted to add in our pipeline. It could be some official taxonomy definitions about our doc_id labels, such as SIC or SOC code definitions.\n",
"What if we had some additional context information that we wanted to add in our pipeline. It could be some official taxonomy definitions about our doc_labels, such as SIC or SOC code definitions.\n",
"\n",
"We may want to inject this extra information that's not directly stored as metadata in the knowledgebase, so that a downstream component (such as a RAG agent) can use the additional information"
]
Expand Down Expand Up @@ -551,8 +551,8 @@
"outputs": [],
"source": [
"def add_id_definitions(input_data: VectorStoreSearchOutput) -> VectorStoreSearchOutput:\n",
" # Map the 'doc_id' column to the corresponding definitions from the dictionary\n",
" input_data.loc[:, \"id_definition\"] = input_data[\"doc_id\"].map(official_id_definitions)\n",
" # Map the 'doc_label' column to the corresponding definitions from the dictionary\n",
" input_data.loc[:, \"id_definition\"] = input_data[\"doc_label\"].map(official_id_definitions)\n",
"\n",
" return input_data"
]
Expand Down Expand Up @@ -661,11 +661,6 @@
" - try adding a new column of data to the reverse search results \n",
" - make it so that if the user tries to reverse search for a specific ID that is 'secret' then that row is removed from the input data."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {
Expand Down
9 changes: 1 addition & 8 deletions DEMO/custom_vectoriser.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@
"outputs": [],
"source": [
"# we're going to use scikit learns countvectoriser to create our one hot embeddings - install in the terminal or uncomment the below code\n",
"# !pip install scikit-learn"
"# !pip install scikit-learn OR uv pip install scikit-learn"
]
},
{
Expand Down Expand Up @@ -319,13 +319,6 @@
"we can create our own custom vectoriser such as the one-hot encoding model shown here. \n",
"Check out the other DEMO notebooks to see how to use the Vectorstore and Vectorisers in other ways and how to deploy your search system over a RestAPI service :)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
2 changes: 1 addition & 1 deletion DEMO/data/fake_soc_dataset.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
id,text
label,text
101,"Fruit farmer: Grows and harvests fruits such as apples, oranges, and berries."
101,"Vegetable farmer: Cultivates and harvests vegetables like carrots, potatoes, and lettuce."
102,"Dairy farmer: Manages cows for milk production and processes dairy products."
Expand Down
2 changes: 1 addition & 1 deletion DEMO/data/testdata.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
id,text,colour,country,language
label,text,colour,country,language
0001,The sun rises in the east.,Orange,India,Hindi
0002,The moon shines at night.,White,USA,English
0003,Rivers flow to the sea.,Blue,Brazil,Portuguese
Expand Down
12 changes: 6 additions & 6 deletions DEMO/general_workflow_demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@
"source": [
"from classifai.indexers.dataclasses import VectorStoreReverseSearchInput\n",
"\n",
"input_data_2 = VectorStoreReverseSearchInput({\"id\": [\"1\", \"2\"], \"doc_id\": [\"1100\", \"1056\"]})\n",
"input_data_2 = VectorStoreReverseSearchInput({\"id\": [\"1\", \"2\"], \"doc_label\": [\"1100\", \"1056\"]})\n",
"\n",
"my_vector_store.reverse_search(input_data_2)"
]
Expand All @@ -227,7 +227,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## With reverse search you can do partial matching!\n",
"### With reverse search you can do partial matching!\n",
"use the `partial_match` flag to check if the **ids/labels** start with our query id"
]
},
Expand All @@ -237,7 +237,7 @@
"metadata": {},
"outputs": [],
"source": [
"input_data_3 = VectorStoreReverseSearchInput({\"id\": [\"1\", \"2\"], \"doc_id\": [\"1100\", \"105\"]})\n",
"input_data_3 = VectorStoreReverseSearchInput({\"id\": [\"1\", \"2\"], \"doc_label\": [\"1100\", \"105\"]})\n",
"\n",
"my_vector_store.reverse_search(input_data_3, partial_match=True)"
]
Expand All @@ -250,7 +250,7 @@
"source": [
"## use n_results to limit the amount of results per-item\n",
"\n",
"my_vector_store.reverse_search(input_data_3, n_results=2, partial_match=True)"
"my_vector_store.reverse_search(input_data_3, max_n_results=2, partial_match=True)"
]
},
{
Expand Down Expand Up @@ -445,7 +445,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "classifai",
"language": "python",
"name": "python3"
},
Expand All @@ -459,7 +459,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
"version": "3.13.7"
}
},
"nbformat": 4,
Expand Down
44 changes: 25 additions & 19 deletions src/classifai/indexers/dataclasses.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ class VectorStoreSearchOutput(pd.DataFrame):
Attributes:
query_id (pd.Series): Identifier for the source query.
query_text (pd.Series): The original query text.
doc_id (pd.Series): Identifier for the retrieved document.
doc_label (pd.Series): Identifier for the retrieved document.
doc_text (pd.Series): The text content of the retrieved document.
rank (pd.Series): The ranking position of the result (0-indexed, non-negative).
score (pd.Series): The similarity score or relevance metric.
Expand All @@ -77,7 +77,7 @@ class VectorStoreSearchOutput(pd.DataFrame):
{
"query_id": pa.Column(str),
"query_text": pa.Column(str),
"doc_id": pa.Column(str),
"doc_label": pa.Column(str),
"doc_text": pa.Column(str),
"rank": pa.Column(int, pa.Check.ge(0)),
"score": pa.Column(float),
Expand Down Expand Up @@ -117,8 +117,8 @@ def query_text(self) -> pd.Series:
return self["query_text"]

@property
def doc_id(self) -> pd.Series:
return self["doc_id"]
def doc_label(self) -> pd.Series:
return self["doc_label"]

@property
def doc_text(self) -> pd.Series:
Expand All @@ -141,13 +141,13 @@ class VectorStoreReverseSearchInput(pd.DataFrame):

Attributes:
id (pd.Series): Unique identifier for the reverse search query.
doc_id (pd.Series): The document ID to find similar documents for.
doc_label (pd.Series): The document label to find similar documents for.
"""

_schema = pa.DataFrameSchema(
{
"id": pa.Column(str),
"doc_id": pa.Column(str),
"doc_label": pa.Column(str),
},
coerce=True,
)
Expand Down Expand Up @@ -179,8 +179,8 @@ def id(self) -> pd.Series:
return self["id"]

@property
def text(self) -> pd.Series:
return self["doc_id"]
def doc_label(self) -> pd.Series:
return self["doc_label"]


class VectorStoreReverseSearchOutput(pd.DataFrame):
Expand All @@ -190,16 +190,18 @@ class VectorStoreReverseSearchOutput(pd.DataFrame):
containing knowledgebase examples with the same label as in the query.

Attributes:
query_id (pd.Series): Identifier for the input label for lookup in the knowledgebase.
doc_id (pd.Series): Identifier for the knowledgebase example retrieved.
doc_text (pd.Series): The text content of the retrieved example.
id (pd.Series): Identifier for the input label for lookup in the knowledgebase.
doc_label (pd.Series): Identifier for the knowledgebase example retrieved.
retrieved_doc_label (pd.Series): Identifier for the retrieved document with the same label.
retrieved_doc_text (pd.Series): The text content of the retrieved example.
"""

_schema = pa.DataFrameSchema(
{
"id": pa.Column(str),
"doc_id": pa.Column(str),
"doc_text": pa.Column(str),
"doc_label": pa.Column(str),
"retrieved_doc_label": pa.Column(str),
"retrieved_doc_text": pa.Column(str),
}
)

Expand All @@ -226,16 +228,20 @@ def validate(cls, df: pd.DataFrame) -> "VectorStoreReverseSearchOutput":
return cls(validated_df)

@property
def query_id(self) -> pd.Series:
return self["input_doc_id"]
def id(self) -> pd.Series:
return self["id"]

@property
def doc_id(self) -> pd.Series:
return self["retrieved_doc_id"]
def doc_label(self) -> pd.Series:
return self["doc_label"]

@property
def doc_text(self) -> pd.Series:
return self["doc_text"]
def retrieved_doc_label(self) -> pd.Series:
return self["retrieved_doc_label"]

@property
def retrieved_doc_text(self) -> pd.Series:
return self["retrieved_doc_text"]


class VectorStoreEmbedInput(pd.DataFrame):
Expand Down
Loading
Loading