diff --git a/README.md b/README.md
index 56c0d24..dd30952 100644
--- a/README.md
+++ b/README.md
@@ -83,6 +83,7 @@ S3_HOST_BUCKET=%(bucket)s.s3.amazonaws.com
 # Open AI (Aliyun)
 OPENAI_API_BASE=https://dashscope.aliyuncs.com/compatible-mode/v1
 OPENAI_API_KEY=********
+OPENAI_MODEL=qwen-turbo
 ```
 
 - Generate a [personal access token](https://github.com/settings/tokens) from GitHub and fill the `GITHUB_ACCESS_TOKEN`
diff --git a/frontend/src/layouts/MainLayout.vue b/frontend/src/layouts/MainLayout.vue
index 84f321b..4e2b194 100644
--- a/frontend/src/layouts/MainLayout.vue
+++ b/frontend/src/layouts/MainLayout.vue
@@ -85,6 +85,7 @@ export default {
       drawer: false,
       topics: [
         "all",
+        "ai",
         "python",
         "java",
         "cpp",
diff --git a/utils.py b/utils.py
index 0441ea7..bc69499 100644
--- a/utils.py
+++ b/utils.py
@@ -14,6 +14,7 @@ from sqlalchemy import Column, String, Integer, DateTime, JSON
 from sqlalchemy.orm import declarative_base
 from openai import OpenAI
+from pydantic import BaseModel
 
 MAX_COMMENT_LENGTH = 512
 
 
@@ -22,6 +23,9 @@
     api_key=os.getenv("OPENAI_API_KEY"),
     base_url=os.getenv("OPENAI_API_BASE")
 )
 
+# OpenAI model for chat completions (default: qwen-turbo)
+OPENAI_MODEL = os.getenv("OPENAI_MODEL", "qwen-turbo")
+
 
 class LogFormatter(logging.Formatter):
@@ -175,7 +179,7 @@ def tldr(text: str) -> str:
         + f"The README of the repository is: \n\n{text}"
     )
     resp = openai_client.chat.completions.create(
-        model="qwen-turbo",
+        model=OPENAI_MODEL,
         messages=[
             {
                 "role": "user",
@@ -186,6 +190,50 @@ def tldr(text: str) -> str:
     return resp.choices[0].message.content
 
 
+class AIRelevance(BaseModel):
+    """Model for AI relevance detection."""
+
+    is_ai_related: bool
+
+
+def isai(text: str) -> bool:
+    """
+    Determine if a repository is related to AI based on its description.
+
+    Args:
+        text: The repository description or README content.
+
+    Returns:
+        True if the repository is AI-related, False otherwise.
+    """
+    prompt = (
+        "Determine if this GitHub repository is related to Artificial Intelligence (AI), "
+        "Large Language Models (LLMs), Vision Language Model (VLM), World Model, "
+        "Retrieval-Augmented Generation (RAG), Vector Database, Embedding, Agent, "
+        "Vibe Coding, Harness Engineering, or other AI fields. "
+        "Consider libraries, frameworks, models, and tools for AI development.\n\n"
+        f"Repository description/README:\n{text}"
+    )
+
+    resp = openai_client.beta.chat.completions.parse(
+        model=OPENAI_MODEL,
+        messages=[
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        ],
+        response_format=AIRelevance,
+        extra_body={
+            "chat_template_kwargs": {
+                "enable_thinking": False,
+            }
+        },
+    )
+
+    return resp.choices[0].message.parsed.is_ai_related
+
+
 def get_repo_info(github_client: Github, full_name: str) -> Optional[Dict]:
     """
     Get GitHub repository information.
@@ -204,6 +252,14 @@ def get_repo_info(github_client: Github, full_name: str) -> Optional[Dict]:
     if description is None:
         description = tldr(repo.get_readme().decoded_content.decode("utf-8"))
         print("QWEN:", description)
+
+    # Check if repository is AI-related and add "ai" category
+    if isai(description):
+        if categories is None:
+            categories = ["ai"]
+        elif "ai" not in categories:
+            categories.append("ai")
+
     description_embedding = embedding(description)
     item = {
         "ItemId": full_name.replace("/", ":").lower(),
diff --git a/x.py b/x.py
index 2d6577d..51922cc 100644
--- a/x.py
+++ b/x.py
@@ -226,8 +226,6 @@ def upgrade_embedding():
     cursor = ""
     while True:
         items, cursor = gorse_client.get_items(1000, cursor)
-        if cursor == "":
-            break
         for item in tqdm(items):
             if len(item["Comment"]) > 0:
                 item["Labels"]["embedding"] = embedding(item["Comment"])
@@ -235,6 +233,9 @@
                 item["ItemId"],
                 labels=item["Labels"],
             )
+
+        if cursor == "":
+            break
 
 
 def write_dump(f, data: message.Message):
@@ -317,5 +318,46 @@ def dump_playground(database: str, username: Optional[str], password: Optional[s
         f"Dump complete: {num_users} users, {num_items} items, {num_feedback} feedback.")
 
 
+@command.command()
+def upgrade_ai():
+    """Upgrade items with AI category detection."""
+    cursor = ""
+    updated_count = 0
+
+    while True:
+        items, cursor = gorse_client.get_items(1000, cursor)
+        for item in tqdm(items):
+            item_id = item["ItemId"]
+            categories = item.get("Categories") or []
+
+            # Skip if already has "ai" category
+            if "ai" in categories:
+                continue
+
+            # Get description from comment
+            description = item.get("Comment", "")
+            if not description:
+                continue
+
+            # Check if AI-related
+            try:
+                if isai(description):
+                    categories.append("ai")
+                    gorse_client.update_item(
+                        item_id,
+                        categories=categories,
+                    )
+                    updated_count += 1
+                    print(f"UPDATE {item_id} -> ai")
+            except Exception as e:
+                print(f"FAIL {item_id}: {e}")
+                continue
+
+        if cursor == "":
+            break
+
+    print(f"Upgrade complete: {updated_count} items updated with 'ai' category.")
+
+
 if __name__ == "__main__":
     command()