diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..8440e4d --- /dev/null +++ b/.dockerignore @@ -0,0 +1,12 @@ +.git +__pycache__ +*.pyc +*.md +.env* +.claude +.omc +.planning +tests +docs +deploy +*.log diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 0000000..ca7a797 --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,51 @@ +name: Deploy to Cloud Run + +on: + push: + branches: + - main + +env: + PROJECT_ID: claude-mcp-457317 + REGION: us-central1 + SERVICE_NAME: gmail-scraper + SERVICE_ACCOUNT: claude-service-account@claude-mcp-457317.iam.gserviceaccount.com + +jobs: + deploy: + runs-on: ubuntu-latest + permissions: + contents: read + id-token: write + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Google Auth + id: auth + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ secrets.WIF_PROVIDER }} + service_account: claude-service-account@claude-mcp-457317.iam.gserviceaccount.com + + - name: Set up Cloud SDK + uses: google-github-actions/setup-gcloud@v2 + + - name: Deploy to Cloud Run + uses: google-github-actions/deploy-cloudrun@v2 + with: + service: ${{ env.SERVICE_NAME }} + region: ${{ env.REGION }} + source: . 
+ env_vars: | + PROJECT_ID=${{ env.PROJECT_ID }} + DATASET_ID=gmail_analytics + TABLE_ID=messages + ADMIN_EMAIL=avi@envsn.com + flags: '--allow-unauthenticated --service-account=${{ env.SERVICE_ACCOUNT }} --timeout=3600 --memory=2Gi --cpu=2 --max-instances=1 --cpu-boost' + + - name: Update Cloud Scheduler + run: | + chmod +x setup_scheduler.sh + ./setup_scheduler.sh diff --git a/.gitignore b/.gitignore index b686665..2f5f69a 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,5 @@ ENV/ # Output files all_company_emails.json +venv/ +service_account.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..044b5bc --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "python-envs.defaultEnvManager": "ms-python.python:venv", + "python-envs.pythonProjects": [], + "githubPullRequests.ignoredPullRequestBranches": [ + "main" + ] +} \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 181ffe0..c3ab170 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,17 +1,35 @@ -# Use the official lightweight Python image. -# https://hub.docker.com/_/python -FROM python:3.10-slim +# syntax=docker/dockerfile:1 -# Allow statements and log messages to immediately appear in the Knative logs -ENV PYTHONUNBUFFERED True +# Stage 1: builder — install dependencies into /app/deps +FROM python:3.10-slim@sha256:4ba18b066cee17f2696cf9a2ba564d7d5eb05a91d6a949326780aa7c6912160d AS builder + +ENV PYTHONUNBUFFERED=True +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir --target=/app/deps -r requirements.txt + + +# Stage 2: runtime — non-root user, copy deps and app code +FROM python:3.10-slim@sha256:4ba18b066cee17f2696cf9a2ba564d7d5eb05a91d6a949326780aa7c6912160d AS runtime + +ENV PYTHONUNBUFFERED=True +ENV APP_HOME=/app +ENV PYTHONPATH=/app/deps +ENV PATH="/app/deps/bin:${PATH}" -# Copy local code to the container image. -ENV APP_HOME /app WORKDIR $APP_HOME -COPY . ./ -# Install production dependencies. 
-RUN pip install --no-cache-dir -r requirements.txt +# Copy installed dependencies from builder +COPY --from=builder /app/deps /app/deps + +# Copy application source files +COPY gmail_scraper.py . +COPY main.py . + +# Run as nonroot (uid 65532) +RUN useradd --no-log-init -u 65532 -r -s /sbin/nologin nonroot +USER nonroot # Run the web service on container startup. # Use functions-framework to run the function diff --git a/README.md b/README.md index b51f8f6..0f368b5 100644 --- a/README.md +++ b/README.md @@ -1 +1,4 @@ -# gmail \ No newline at end of file +# Gmail Scraper (Context Layer Ingestion) + +This service ingests domain-wide Gmail into BigQuery for the Envision context layer. +Operational details and deployment instructions live in `README_CLOUDRUN.md`. diff --git a/cloudbuild.yaml b/cloudbuild.yaml index f687ff4..511978f 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -34,11 +34,11 @@ steps: args: - '-c' - | - gcloud scheduler jobs delete gmail-scraper-hourly \ + gcloud scheduler jobs delete gmail-scraper-5min \ --location=us-central1 \ --quiet 2>/dev/null || echo "No existing job to delete" - # Step 3: Create hourly scheduler job + # Step 3: Create 5-minute scheduler job - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk' id: 'create-scheduler' entrypoint: bash @@ -49,16 +49,16 @@ steps: --region=us-central1 \ --format='value(status.url)') - gcloud scheduler jobs create http gmail-scraper-hourly \ + gcloud scheduler jobs create http gmail-scraper-5min \ --location=us-central1 \ - --schedule="0 * * * *" \ + --schedule="*/5 * * * *" \ --time-zone="America/New_York" \ --uri="$${SERVICE_URL}/" \ --http-method=POST \ --headers="Content-Type=application/json" \ --message-body='{"incremental": true, "max_per_user": 100}' \ --attempt-deadline=3600s \ - --description="Hourly incremental Gmail scrape to BigQuery" + --description="5-minute incremental Gmail scrape to BigQuery" options: logging: CLOUD_LOGGING_ONLY diff --git a/deploy.sh b/deploy.sh index 
6c83843..5beb83f 100755 --- a/deploy.sh +++ b/deploy.sh @@ -9,7 +9,7 @@ REGION="us-central1" SERVICE_ACCOUNT_EMAIL="claude-service-account@claude-mcp-457317.iam.gserviceaccount.com" SERVICE_ACCOUNT_KEY_FILE="$HOME/claude-mcp-457317-069a2a199017.json" ADMIN_EMAIL="avi@envsn.com" -SCHEDULER_JOB_NAME="gmail-scraper-hourly" +SCHEDULER_JOB_NAME="gmail-scraper-5min" echo "=== Gmail Scraper Cloud Run Deployment ===" echo "Project: $PROJECT_ID" @@ -48,7 +48,7 @@ echo "Service URL: $SERVICE_URL" # Step 4: Set up Cloud Scheduler for hourly incremental scraping echo "" -echo "Step 4: Setting up hourly Cloud Scheduler job..." +echo "Step 4: Setting up 5-minute Cloud Scheduler job..." # Delete existing job if it exists gcloud scheduler jobs delete $SCHEDULER_JOB_NAME \ @@ -56,18 +56,18 @@ gcloud scheduler jobs delete $SCHEDULER_JOB_NAME \ --location=$REGION \ --quiet 2>/dev/null || true -# Create new scheduler job (runs every hour at minute 0) +# Create new scheduler job (runs every 5 minutes) gcloud scheduler jobs create http $SCHEDULER_JOB_NAME \ --project=$PROJECT_ID \ --location=$REGION \ - --schedule="0 * * * *" \ + --schedule="*/5 * * * *" \ --time-zone="America/New_York" \ --uri="${SERVICE_URL}/" \ --http-method=POST \ --headers="Content-Type=application/json" \ --message-body='{"incremental": true, "max_per_user": 100}' \ --attempt-deadline=3600s \ - --description="Hourly incremental Gmail scrape to BigQuery" + --description="5-minute incremental Gmail scrape to BigQuery" echo "" echo "=== Deployment Complete ===" @@ -77,7 +77,7 @@ echo " URL: $SERVICE_URL" echo "" echo "Cloud Scheduler Job:" echo " Name: $SCHEDULER_JOB_NAME" -echo " Schedule: Every hour at minute 0 (0 * * * *)" +echo " Schedule: Every 5 minutes (*/5 * * * *)" echo " Timezone: America/New_York" echo " Mode: Incremental (only new messages)" echo "" diff --git a/gmail_scraper.py b/gmail_scraper.py index e497568..c07592f 100644 --- a/gmail_scraper.py +++ b/gmail_scraper.py @@ -4,30 +4,30 @@ import json 
import os import base64 -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from email.utils import parsedate_to_datetime # Service account configuration SERVICE_ACCOUNT_FILE = os.getenv('SERVICE_ACCOUNT_FILE', 'service-account-key.json') -SCOPES = [ - 'https://www.googleapis.com/auth/gmail.readonly', - 'https://www.googleapis.com/auth/admin.directory.user.readonly', - 'https://www.googleapis.com/auth/bigquery' -] + +# Scopes definitions +BQ_SCOPES = ['https://www.googleapis.com/auth/bigquery'] +GMAIL_SCOPES = ['https://www.googleapis.com/auth/gmail.readonly'] +ADMIN_SCOPES = ['https://www.googleapis.com/auth/admin.directory.user.readonly'] # BigQuery configuration PROJECT_ID = os.getenv('PROJECT_ID', 'claude-mcp-457317') DATASET_ID = os.getenv('DATASET_ID', 'gmail_analytics') TABLE_ID = os.getenv('TABLE_ID', 'messages') -def get_credentials(): - """Get service account credentials.""" +def get_service_account_credentials(scopes): + """Get base service account credentials with specific scopes.""" return service_account.Credentials.from_service_account_file( - SERVICE_ACCOUNT_FILE, scopes=SCOPES) + SERVICE_ACCOUNT_FILE, scopes=scopes) def get_bigquery_client(): - """Get BigQuery client.""" - credentials = get_credentials() + """Get BigQuery client using service account identity.""" + credentials = get_service_account_credentials(BQ_SCOPES) return bigquery.Client(project=PROJECT_ID, credentials=credentials) def ensure_table_exists(client): @@ -206,7 +206,7 @@ def process_message(message, user_email): 'has_attachments': has_attachments, 'attachment_count': attachment_count, 'size_estimate': message.get('sizeEstimate'), - 'scraped_at': datetime.utcnow().isoformat(), + 'scraped_at': datetime.now(timezone.utc).isoformat(), } def insert_to_bigquery(client, table_ref, rows): @@ -222,7 +222,8 @@ def insert_to_bigquery(client, table_ref, rows): def get_all_users(admin_email): """Get all users in the Google Workspace domain.""" - 
credentials = get_credentials() + # Use Admin SDK scopes for this operation + credentials = get_service_account_credentials(ADMIN_SCOPES) delegated_creds = credentials.with_subject(admin_email) admin_service = build('admin', 'directory_v1', credentials=delegated_creds) @@ -246,7 +247,8 @@ def get_all_users(admin_email): def scrape_user_emails(user_email, query='', max_results=100, existing_ids=None): """Scrape emails for a specific user, skipping already-scraped messages.""" - credentials = get_credentials() + # Use Gmail scopes for this operation + credentials = get_service_account_credentials(GMAIL_SCOPES) delegated_creds = credentials.with_subject(user_email) gmail_service = build('gmail', 'v1', credentials=delegated_creds) @@ -370,7 +372,7 @@ def main(query='', max_per_user=100, incremental=True): results['errors'].append(error_msg) results['status'] = 'completed' - results['completed_at'] = datetime.utcnow().isoformat() + results['completed_at'] = datetime.now(timezone.utc).isoformat() except Exception as e: results['status'] = 'failed' diff --git a/setup_scheduler.sh b/setup_scheduler.sh index 3e7436a..d98c999 100755 --- a/setup_scheduler.sh +++ b/setup_scheduler.sh @@ -6,7 +6,7 @@ set -e # Configuration PROJECT_ID="claude-mcp-457317" REGION="us-central1" -JOB_NAME="gmail-scraper-hourly" +JOB_NAME="gmail-scraper-5min" SERVICE_NAME="gmail-scraper" echo "=== Cloud Scheduler Setup for Gmail Scraper ===" @@ -40,17 +40,17 @@ gcloud scheduler jobs delete $JOB_NAME \ --quiet 2>/dev/null || true # Create the scheduler job -# Runs every hour at minute 0 +# Runs every 5 minutes gcloud scheduler jobs create http $JOB_NAME \ --project=$PROJECT_ID \ --location=$REGION \ - --schedule="0 * * * *" \ + --schedule="*/5 * * * *" \ --time-zone="America/New_York" \ --uri="${SERVICE_URL}/" \ --http-method=POST \ --headers="Content-Type=application/json" \ --message-body='{"incremental": true, "max_per_user": 100}' \ - --attempt-deadline=3600s \ + --attempt-deadline=1800s \ 
- --description="Hourly incremental Gmail scrape to BigQuery" + --description="5-minute incremental Gmail scrape to BigQuery" echo ""