Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
.git
__pycache__
*.pyc
*.md
.env*
.claude
.omc
.planning
tests
docs
deploy
*.log
51 changes: 51 additions & 0 deletions .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
name: Deploy to Cloud Run

on:
push:
branches:
- main

env:
PROJECT_ID: claude-mcp-457317
REGION: us-central1
SERVICE_NAME: gmail-scraper
SERVICE_ACCOUNT: claude-service-account@claude-mcp-457317.iam.gserviceaccount.com

jobs:
deploy:
runs-on: ubuntu-latest
permissions:
contents: read
id-token: write

steps:
- name: Checkout
uses: actions/checkout@v4

- name: Google Auth
id: auth
uses: google-github-actions/auth@v2
with:
workload_identity_provider: ${{ secrets.WIF_PROVIDER }}
service_account: claude-service-account@claude-mcp-457317.iam.gserviceaccount.com

- name: Set up Cloud SDK
uses: google-github-actions/setup-gcloud@v2

- name: Deploy to Cloud Run
uses: google-github-actions/deploy-cloudrun@v2
with:
service: ${{ env.SERVICE_NAME }}
region: ${{ env.REGION }}
source: .
env_vars: |
PROJECT_ID=${{ env.PROJECT_ID }}
DATASET_ID=gmail_analytics
TABLE_ID=messages
ADMIN_EMAIL=avi@envsn.com
flags: '--allow-unauthenticated --service-account=${{ env.SERVICE_ACCOUNT }} --timeout=3600 --memory=2Gi --cpu=2 --max-instances=1 --cpu-boost'

- name: Update Cloud Scheduler
run: |
chmod +x setup_scheduler.sh
./setup_scheduler.sh
Comment thread Fixed
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,5 @@ ENV/

# Output files
all_company_emails.json
venv/
service_account.json
7 changes: 7 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"python-envs.defaultEnvManager": "ms-python.python:venv",
"python-envs.pythonProjects": [],
"githubPullRequests.ignoredPullRequestBranches": [
"main"
]
}
38 changes: 28 additions & 10 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,17 +1,35 @@
# Use the official lightweight Python image.
# https://hub.docker.com/_/python
FROM python:3.10-slim
# syntax=docker/dockerfile:1

# Allow statements and log messages to immediately appear in the Knative logs
ENV PYTHONUNBUFFERED True
# Stage 1: builder — install dependencies into /app/deps
FROM python:3.10-slim@sha256:4ba18b066cee17f2696cf9a2ba564d7d5eb05a91d6a949326780aa7c6912160d AS builder

ENV PYTHONUNBUFFERED=True
WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir --target=/app/deps -r requirements.txt


# Stage 2: runtime — non-root user, copy deps and app code
FROM python:3.10-slim@sha256:4ba18b066cee17f2696cf9a2ba564d7d5eb05a91d6a949326780aa7c6912160d AS runtime

ENV PYTHONUNBUFFERED=True
ENV APP_HOME=/app
ENV PYTHONPATH=/app/deps
ENV PATH="/app/deps/bin:${PATH}"

# Copy local code to the container image.
ENV APP_HOME /app
WORKDIR $APP_HOME
COPY . ./

# Install production dependencies.
RUN pip install --no-cache-dir -r requirements.txt
# Copy installed dependencies from builder
COPY --from=builder /app/deps /app/deps

# Copy application source files
COPY gmail_scraper.py .
COPY main.py .

# Run as nonroot (uid 65532)
RUN useradd --no-log-init -u 65532 -r -s /sbin/nologin nonroot
USER nonroot

# Run the web service on container startup.
# Use functions-framework to run the function
Expand Down
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
# gmail
# Gmail Scraper (Context Layer Ingestion)

This service ingests domain-wide Gmail into BigQuery for the Envision context layer.
Operational details and deployment instructions live in `README_CLOUDRUN.md`.
10 changes: 5 additions & 5 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ steps:
args:
- '-c'
- |
gcloud scheduler jobs delete gmail-scraper-hourly \
gcloud scheduler jobs delete gmail-scraper-5min \
--location=us-central1 \
--quiet 2>/dev/null || echo "No existing job to delete"

# Step 3: Create hourly scheduler job
# Step 3: Create 5-minute scheduler job
- name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
id: 'create-scheduler'
entrypoint: bash
Expand All @@ -49,16 +49,16 @@ steps:
--region=us-central1 \
--format='value(status.url)')

gcloud scheduler jobs create http gmail-scraper-hourly \
gcloud scheduler jobs create http gmail-scraper-5min \
--location=us-central1 \
--schedule="0 * * * *" \
--schedule="*/5 * * * *" \
--time-zone="America/New_York" \
--uri="$${SERVICE_URL}/" \
--http-method=POST \
--headers="Content-Type=application/json" \
--message-body='{"incremental": true, "max_per_user": 100}' \
--attempt-deadline=3600s \
--description="Hourly incremental Gmail scrape to BigQuery"
--description="5-minute incremental Gmail scrape to BigQuery"

options:
logging: CLOUD_LOGGING_ONLY
Expand Down
12 changes: 6 additions & 6 deletions deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ REGION="us-central1"
SERVICE_ACCOUNT_EMAIL="claude-service-account@claude-mcp-457317.iam.gserviceaccount.com"
SERVICE_ACCOUNT_KEY_FILE="$HOME/claude-mcp-457317-069a2a199017.json"
ADMIN_EMAIL="avi@envsn.com"
SCHEDULER_JOB_NAME="gmail-scraper-hourly"
SCHEDULER_JOB_NAME="gmail-scraper-5min"

echo "=== Gmail Scraper Cloud Run Deployment ==="
echo "Project: $PROJECT_ID"
Expand Down Expand Up @@ -48,26 +48,26 @@ echo "Service URL: $SERVICE_URL"

# Step 4: Set up Cloud Scheduler for 5-minute incremental scraping
echo ""
echo "Step 4: Setting up hourly Cloud Scheduler job..."
echo "Step 4: Setting up 5-minute Cloud Scheduler job..."

# Delete existing job if it exists
gcloud scheduler jobs delete $SCHEDULER_JOB_NAME \
--project=$PROJECT_ID \
--location=$REGION \
--quiet 2>/dev/null || true

# Create new scheduler job (runs every hour at minute 0)
# Create new scheduler job (runs every 5 minutes)
gcloud scheduler jobs create http $SCHEDULER_JOB_NAME \
--project=$PROJECT_ID \
--location=$REGION \
--schedule="0 * * * *" \
--schedule="*/5 * * * *" \
--time-zone="America/New_York" \
--uri="${SERVICE_URL}/" \
--http-method=POST \
--headers="Content-Type=application/json" \
--message-body='{"incremental": true, "max_per_user": 100}' \
--attempt-deadline=3600s \
--description="Hourly incremental Gmail scrape to BigQuery"
--description="5-minute incremental Gmail scrape to BigQuery"

echo ""
echo "=== Deployment Complete ==="
Expand All @@ -77,7 +77,7 @@ echo " URL: $SERVICE_URL"
echo ""
echo "Cloud Scheduler Job:"
echo " Name: $SCHEDULER_JOB_NAME"
echo " Schedule: Every hour at minute 0 (0 * * * *)"
echo " Schedule: Every 5 minutes (*/5 * * * *)"
echo " Timezone: America/New_York"
echo " Mode: Incremental (only new messages)"
echo ""
Expand Down
32 changes: 17 additions & 15 deletions gmail_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,30 @@
import json
import os
import base64
from datetime import datetime, timedelta
from datetime import datetime, timedelta, timezone
from email.utils import parsedate_to_datetime

# Service account configuration
SERVICE_ACCOUNT_FILE = os.getenv('SERVICE_ACCOUNT_FILE', 'service-account-key.json')
SCOPES = [
'https://www.googleapis.com/auth/gmail.readonly',
'https://www.googleapis.com/auth/admin.directory.user.readonly',
'https://www.googleapis.com/auth/bigquery'
]

# Scopes definitions
BQ_SCOPES = ['https://www.googleapis.com/auth/bigquery']
GMAIL_SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
ADMIN_SCOPES = ['https://www.googleapis.com/auth/admin.directory.user.readonly']

# BigQuery configuration
PROJECT_ID = os.getenv('PROJECT_ID', 'claude-mcp-457317')
DATASET_ID = os.getenv('DATASET_ID', 'gmail_analytics')
TABLE_ID = os.getenv('TABLE_ID', 'messages')

def get_credentials():
"""Get service account credentials."""
def get_service_account_credentials(scopes):
"""Get base service account credentials with specific scopes."""
return service_account.Credentials.from_service_account_file(
SERVICE_ACCOUNT_FILE, scopes=SCOPES)
SERVICE_ACCOUNT_FILE, scopes=scopes)

def get_bigquery_client():
"""Get BigQuery client."""
credentials = get_credentials()
"""Get BigQuery client using service account identity."""
credentials = get_service_account_credentials(BQ_SCOPES)
return bigquery.Client(project=PROJECT_ID, credentials=credentials)

def ensure_table_exists(client):
Expand Down Expand Up @@ -206,7 +206,7 @@ def process_message(message, user_email):
'has_attachments': has_attachments,
'attachment_count': attachment_count,
'size_estimate': message.get('sizeEstimate'),
'scraped_at': datetime.utcnow().isoformat(),
'scraped_at': datetime.now(timezone.utc).isoformat(),
}

def insert_to_bigquery(client, table_ref, rows):
Expand All @@ -222,7 +222,8 @@ def insert_to_bigquery(client, table_ref, rows):

def get_all_users(admin_email):
"""Get all users in the Google Workspace domain."""
credentials = get_credentials()
# Use Admin SDK scopes for this operation
credentials = get_service_account_credentials(ADMIN_SCOPES)
delegated_creds = credentials.with_subject(admin_email)
admin_service = build('admin', 'directory_v1', credentials=delegated_creds)

Expand All @@ -246,7 +247,8 @@ def get_all_users(admin_email):

def scrape_user_emails(user_email, query='', max_results=100, existing_ids=None):
"""Scrape emails for a specific user, skipping already-scraped messages."""
credentials = get_credentials()
# Use Gmail scopes for this operation
credentials = get_service_account_credentials(GMAIL_SCOPES)
delegated_creds = credentials.with_subject(user_email)
gmail_service = build('gmail', 'v1', credentials=delegated_creds)

Expand Down Expand Up @@ -370,7 +372,7 @@ def main(query='', max_per_user=100, incremental=True):
results['errors'].append(error_msg)

results['status'] = 'completed'
results['completed_at'] = datetime.utcnow().isoformat()
results['completed_at'] = datetime.now(timezone.utc).isoformat()

except Exception as e:
results['status'] = 'failed'
Expand Down
8 changes: 4 additions & 4 deletions setup_scheduler.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ set -e
# Configuration
PROJECT_ID="claude-mcp-457317"
REGION="us-central1"
JOB_NAME="gmail-scraper-hourly"
JOB_NAME="gmail-scraper-5min"
SERVICE_NAME="gmail-scraper"

echo "=== Cloud Scheduler Setup for Gmail Scraper ==="
Expand Down Expand Up @@ -40,17 +40,17 @@ gcloud scheduler jobs delete $JOB_NAME \
--quiet 2>/dev/null || true

# Create the scheduler job
# Runs every hour at minute 0
# Runs every 5 minutes
gcloud scheduler jobs create http $JOB_NAME \
--project=$PROJECT_ID \
--location=$REGION \
--schedule="0 * * * *" \
--schedule="*/5 * * * *" \
--time-zone="America/New_York" \
--uri="${SERVICE_URL}/" \
--http-method=POST \
--headers="Content-Type=application/json" \
--message-body='{"incremental": true, "max_per_user": 100}' \
--attempt-deadline=3600s \
--attempt-deadline=1800s \
--description="5-minute incremental Gmail scrape to BigQuery"

echo ""
Expand Down
Loading