Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ export JINA_API_KEY=your-key-here
| `jina search QUERY` | Web search (also --arxiv, --ssrn, --images, --blog) |
| `jina embed TEXT` | Generate embeddings |
| `jina rerank QUERY` | Rerank documents from stdin by relevance |
| `jina classify TEXT` | Classify text into labels |
| `jina dedup` | Deduplicate text from stdin |
| `jina screenshot URL` | Capture screenshot of a URL |
| `jina bibtex QUERY` | Search BibTeX citations (DBLP + Semantic Scholar) |
Expand Down Expand Up @@ -86,7 +87,7 @@ jina search "LLMs" --gl us --hl en # US, English
jina embed "hello world"
jina embed "text1" "text2" "text3"
cat texts.txt | jina embed
jina embed "hello" --model jina-embeddings-v3 --task retrieval.query
jina embed "hello" --model jina-embeddings-v5-text-small --task retrieval.query
```

### Rerank
Expand All @@ -96,6 +97,14 @@ cat docs.txt | jina rerank "machine learning"
jina search "AI" | jina rerank "embeddings" --top-n 5
```

### Classify

```bash
jina classify "I love this product" --labels positive,negative,neutral
echo "stock prices rose sharply" | jina classify --labels business,sports,tech
cat texts.txt | jina classify --labels cat1,cat2,cat3 --json
```

### Deduplicate

```bash
Expand Down Expand Up @@ -195,7 +204,7 @@ jina grep serve stop # stop when done

## Local mode

`jina embed` and `jina rerank` support `--local` to run on Apple Silicon via the jina-grep embedding server instead of the Jina API. No API key needed.
`jina embed`, `jina rerank`, and `jina dedup` support `--local` to run on Apple Silicon via the jina-grep embedding server instead of the Jina API. No API key needed.

```bash
# Start the local server first
Expand All @@ -207,6 +216,9 @@ cat texts.txt | jina embed --local --json

# Local reranking (cosine similarity on local embeddings)
cat docs.txt | jina rerank --local "machine learning"

# Local deduplication
cat items.txt | jina dedup --local
```

Local mode uses `jina-embeddings-v5-nano` by default. Override with `--model jina-embeddings-v5-small`.
Expand Down
109 changes: 81 additions & 28 deletions jina_cli/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ def expand_query(
def embed(
texts: list[str],
api_key: str | None = None,
model: str = "jina-embeddings-v3",
model: str = "jina-embeddings-v5-text-small",
task: str = "text-matching",
dimensions: int | None = None,
late_chunking: bool = False,
Expand Down Expand Up @@ -385,6 +385,37 @@ def embed(
return data.get("data", [])


# -- Classify API --


def classify(
    texts: list[str],
    labels: list[str],
    api_key: str | None = None,
    model: str = "jina-embeddings-v5-text-small",
) -> list[dict]:
    """Send texts and candidate labels to the Jina classify endpoint.

    Args:
        texts: Input strings, sent as the ``input`` field of the request.
        labels: Candidate label names, sent as the ``labels`` field.
        api_key: Optional explicit key; resolved via ``require_api_key``.
        model: Model identifier placed in the request body.

    Returns:
        The ``"data"`` list of the JSON response, or ``[]`` when the
        response carries no ``"data"`` key.
    """
    request_headers = {"Content-Type": "application/json"}
    # Auth headers are merged last so they win on any key collision.
    request_headers.update(_auth_headers(require_api_key(api_key)))

    payload: dict = dict(model=model, input=texts, labels=labels)

    with _client() as http:
        response = _request_with_retry(
            "POST",
            f"{API_BASE}/v1/classify",
            http,
            headers=request_headers,
            json=payload,
        )
    parsed = response.json()
    return parsed.get("data", [])


# -- Local Embeddings (via jina-grep server) --


Expand Down Expand Up @@ -493,32 +524,12 @@ def _cosine_similarity(a: list[float], b: list[float]) -> float:
return dot / (norm_a * norm_b)


def deduplicate(
def _deduplicate_from_embeddings(
strings: list[str],
api_key: str | None = None,
embeddings: list[list[float]],
k: int | None = None,
) -> list[dict]:
"""Deduplicate strings using embeddings + greedy selection.

Uses facility-location submodular optimization:
greedily selects items that maximize coverage diversity.
"""
if not strings:
return []
if len(strings) == 1:
return [{"index": 0, "text": strings[0]}]

key = require_api_key(api_key)

# Get embeddings (v5-text-small is faster and sufficient for dedup)
embeddings_data = embed(
strings,
api_key=key,
model="jina-embeddings-v5-text-small",
task="text-matching",
)
embeddings = [item["embedding"] for item in embeddings_data]

"""Core dedup logic: given embeddings, run submodular selection."""
n = len(embeddings)

# Compute similarity matrix
Expand All @@ -530,13 +541,11 @@ def deduplicate(
sim[j][i] = s

# Lazy greedy submodular selection
threshold = 1e-2
if k is None:
# Auto-detect: keep adding until marginal gain drops below threshold
threshold = 1e-2
k = n

selected: list[int] = []
# Track max similarity of each item to any selected item (facility-location)
coverage = [0.0] * n

for _ in range(min(k, n)):
Expand All @@ -546,7 +555,6 @@ def deduplicate(
for i in range(n):
if i in selected:
continue
# Marginal gain: how much does adding i improve coverage?
gain = 0.0
for j in range(n):
new_cov = max(coverage[j], sim[j][i])
Expand All @@ -566,6 +574,51 @@ def deduplicate(
return [{"index": i, "text": strings[i]} for i in selected]


def deduplicate(
strings: list[str],
api_key: str | None = None,
k: int | None = None,
) -> list[dict]:
"""Deduplicate strings using embeddings + greedy selection.

Uses facility-location submodular optimization:
greedily selects items that maximize coverage diversity.
"""
if not strings:
return []
if len(strings) == 1:
return [{"index": 0, "text": strings[0]}]

key = require_api_key(api_key)

embeddings_data = embed(
strings,
api_key=key,
model="jina-embeddings-v5-text-small",
task="text-matching",
)
embeddings = [item["embedding"] for item in embeddings_data]

return _deduplicate_from_embeddings(strings, embeddings, k=k)


def local_deduplicate(
strings: list[str],
model: str = "jina-embeddings-v5-nano",
k: int | None = None,
) -> list[dict]:
"""Deduplicate strings using local embeddings + greedy selection."""
if not strings:
return []
if len(strings) == 1:
return [{"index": 0, "text": strings[0]}]

embeddings_data = local_embed(strings, model=model, task="text-matching")
embeddings = [item["embedding"] for item in embeddings_data]

return _deduplicate_from_embeddings(strings, embeddings, k=k)


# -- BibTeX Search --


Expand Down
71 changes: 67 additions & 4 deletions jina_cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
jina search QUERY Web search
jina embed TEXT Generate embeddings
jina rerank QUERY Rerank documents by relevance
jina classify TEXT Classify text into labels
jina dedup Deduplicate text lines from stdin
jina screenshot URL Capture screenshot of a URL
jina bibtex QUERY Search for BibTeX entries
Expand Down Expand Up @@ -111,6 +112,7 @@ def cli(ctx, api_key):
" jina search QUERY Web search (also --arxiv, --ssrn, --images, --blog)\n"
" jina embed TEXT Generate embeddings\n"
" jina rerank QUERY Rerank documents from stdin by relevance\n"
" jina classify TEXT Classify text into labels\n"
" jina dedup Deduplicate text from stdin\n"
" jina screenshot URL Capture screenshot of a URL\n"
" jina bibtex QUERY Search BibTeX citations\n"
Expand Down Expand Up @@ -266,7 +268,7 @@ def search(ctx, query, arxiv, ssrn, images, blog, num, tbs, location, gl, hl, as

@cli.command()
@click.argument("text", nargs=-1)
@click.option("--model", default=None, help="Model name (default: jina-embeddings-v3, or v5-nano with --local)")
@click.option("--model", default=None, help="Model name (default: jina-embeddings-v5-text-small, or v5-nano with --local)")
@click.option("--task", default=None, help="Embedding task type")
@click.option("--dimensions", type=int, default=None, help="Output dimensions (Matryoshka)")
@click.option("--local", is_flag=True, help="Use local MLX server (requires: jina-grep serve start)")
Expand Down Expand Up @@ -308,7 +310,7 @@ def embed(ctx, text, model, task, dimensions, local, as_json, api_key):
_task = task or "text-matching"
result = api.local_embed(texts, model=_model, task=_task)
else:
_model = model or "jina-embeddings-v3"
_model = model or "jina-embeddings-v5-text-small"
_task = task or "text-matching"
result = api.embed(texts, api_key=key, model=_model, task=_task, dimensions=dimensions)
click.echo(utils.format_embeddings(result, as_json=as_json))
Expand Down Expand Up @@ -366,10 +368,11 @@ def rerank(ctx, query, top_n, model, local, as_json, api_key):

@cli.command()
@click.option("-k", type=int, default=None, help="Number of unique items to keep (auto if not set)")
@click.option("--local", is_flag=True, help="Use local MLX server (requires: jina-grep serve start)")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
@click.option("--api-key", default=None, help="Jina API key")
@click.pass_context
def dedup(ctx, k, as_json, api_key):
def dedup(ctx, k, local, as_json, api_key):
"""Deduplicate text lines from stdin.

Uses embeddings to find semantically unique items.
Expand All @@ -378,6 +381,7 @@ def dedup(ctx, k, as_json, api_key):
Examples:
cat items.txt | jina dedup
jina search "AI" | jina dedup -k 5
cat items.txt | jina dedup --local
"""
key = api_key or ctx.obj.get("api_key")
lines = utils.read_stdin_lines()
Expand All @@ -390,12 +394,71 @@ def dedup(ctx, k, as_json, api_key):
sys.exit(EXIT_USER_ERROR)

try:
result = api.deduplicate(lines, api_key=key, k=k)
if local:
result = api.local_deduplicate(lines, k=k)
else:
result = api.deduplicate(lines, api_key=key, k=k)
click.echo(utils.format_dedup_results(result, as_json=as_json))
except Exception as e:
utils.handle_http_error(e)


# -- classify --


@cli.command()
@click.argument("text", nargs=-1)
@click.option("--labels", required=True, multiple=True,
              help="Labels for classification (comma-separated or repeated --labels)")
@click.option("--model", default=None, help="Model name (default: jina-embeddings-v5-text-small)")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
@click.option("--api-key", default=None, help="Jina API key")
@click.pass_context
def classify(ctx, text, labels, model, as_json, api_key):
    """Classify text into labels.

    Input from arguments or stdin (one text per line).

    \b
    Examples:
      jina classify "this is great" --labels positive,negative
      echo "stock price rose" | jina classify --labels business,sports,tech
      jina classify "text1" "text2" --labels cat1 --labels cat2 --labels cat3
    """
    resolved_key = api_key or ctx.obj.get("api_key")

    # Positional arguments win; stdin is only read when none were given.
    inputs = list(text) or utils.read_stdin_lines()

    if not inputs:
        _short_usage(
            "Usage: jina classify TEXT --labels label1,label2",
            [
                'jina classify "this is great" --labels positive,negative',
                'echo "text" | jina classify --labels label1,label2',
                "cat texts.txt | jina classify --labels a,b,c --json",
            ],
        )

    # Accept both "--labels a,b,c" and repeated "--labels a --labels b";
    # blank fragments (e.g. from "a,,b") are dropped.
    label_names = []
    for chunk in labels:
        for piece in chunk.split(","):
            cleaned = piece.strip()
            if cleaned:
                label_names.append(cleaned)

    if not label_names:
        click.echo("Error: at least one label required.\n"
                   "Fix: --labels positive,negative", err=True)
        sys.exit(EXIT_USER_ERROR)

    chosen_model = model if model else "jina-embeddings-v5-text-small"

    try:
        outcome = api.classify(
            inputs, label_names, api_key=resolved_key, model=chosen_model
        )
        click.echo(utils.format_classify_results(outcome, as_json=as_json))
    except Exception as exc:
        utils.handle_http_error(exc)


# -- screenshot --


Expand Down
26 changes: 26 additions & 0 deletions jina_cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,32 @@ def format_embeddings(data: list[dict], as_json: bool = False) -> str:
return "\n".join(lines)


def _prediction_score(entry: dict) -> float:
    """Return the numeric score of a prediction dict, or 0.0 if unusable.

    Reads ``"score"`` and falls back to ``"confidence"``; values that cannot
    be converted to ``float`` (e.g. ``None``) count as 0.0 so formatting
    never raises.
    """
    raw = entry.get("score", entry.get("confidence", 0))
    try:
        return float(raw)
    except (TypeError, ValueError):
        return 0.0


def format_classify_results(results: list[dict], as_json: bool = False) -> str:
    """Format classification results for display.

    Args:
        results: One dict per classified input. Each dict is read through its
            ``"prediction"`` key (falling back to ``"predictions"``), which
            may hold a single label string or a list of candidates.
        as_json: When true, return the results serialized as indented JSON.

    Returns:
        One line per result, joined with newlines. A string prediction is
        rendered as ``"<label> (<score>)"`` with a four-decimal score. For a
        list whose first entry is a dict, the highest-scoring dict entry is
        rendered the same way; otherwise the first entry is stringified. Any
        other shape falls back to ``str(item)``.
    """
    if as_json:
        return json.dumps(results, indent=2, ensure_ascii=False)

    lines = []
    for item in results:
        predictions = item.get("prediction", item.get("predictions", []))
        if isinstance(predictions, str):
            # Single prediction: the score lives on the result itself.
            lines.append(f"{predictions} ({_prediction_score(item):.4f})")
        elif isinstance(predictions, list) and predictions:
            first = predictions[0]
            if isinstance(first, dict):
                # The list is not guaranteed to be sorted, so pick the best
                # candidate by score (ties keep the earliest entry).
                candidates = [p for p in predictions if isinstance(p, dict)]
                best = max(candidates, key=_prediction_score)
                label = best.get("label", "")
                lines.append(f"{label} ({_prediction_score(best):.4f})")
            else:
                lines.append(str(first))
        else:
            lines.append(str(item))
    return "\n".join(lines)


def format_dedup_results(results: list[dict], as_json: bool = False) -> str:
"""Format deduplication results for display."""
if as_json:
Expand Down
Loading
Loading