From 6ee3ba001a847552a76139204cddca27f0210380 Mon Sep 17 00:00:00 2001 From: Antonio Fernandez Date: Tue, 24 Feb 2026 22:43:13 -0600 Subject: [PATCH] Add Google Forms API integration for questionnaire import (closes #36) Implements complete Google Forms API integration allowing users to import questionnaires directly from Google Forms URLs using simple API key authentication. Features: - URL and form ID parsing with regex pattern matching - Google Forms API v1 integration using google-api-python-client - Support for multiple question types: multiple choice, checkboxes, dropdown, linear scale, text questions, and grid questions - Automatic language detection using langdetect - Comprehensive metadata extraction (form_id, title, description) - Complete test suite with mock data for testing without API calls - Documentation in README.md with setup instructions and usage examples Technical details: - Authentication: API key via GOOGLE_FORMS_API_KEY environment variable - New FileType: google_forms enum value - Parser pattern: convert_google_forms_to_instruments(file: RawFile) -> List[Instrument] - Error handling for HttpError (403, 404, 429) and invalid inputs Files added: - src/harmony/parsing/google_forms_parser.py (~350 lines) - tests/test_google_forms_parser.py (comprehensive test suite) Files modified: - requirements.txt: Added google-api-python-client>=2.147.0 - pyproject.toml: Added google-api-python-client dependency - src/harmony/schemas/enums/file_types.py: Added google_forms enum - src/harmony/parsing/wrapper_all_parsers.py: Added import and routing - README.md: Added Google Forms documentation section --- README.md | 69 +++++ pyproject.toml | 1 + requirements.txt | 1 + src/harmony/parsing/google_forms_parser.py | 318 +++++++++++++++++++++ src/harmony/parsing/wrapper_all_parsers.py | 3 + src/harmony/schemas/enums/file_types.py | 1 + tests/test_google_forms_parser.py | 249 ++++++++++++++++ 7 files changed, 642 insertions(+) create mode 100644 src/harmony/parsing/google_forms_parser.py create mode 100644 tests/test_google_forms_parser.py diff --git a/README.md b/README.md index ed26dc4..c9b519f 100644 --- a/README.md +++ b/README.md @@ -145,6 +145,75 @@ from harmony import load_instruments_from_local_file instruments = load_instruments_from_local_file("gad-7.pdf") ``` +## 📋 Importing from Google Forms + +Harmony can import questionnaires directly from Google Forms URLs, allowing you to harmonise survey instruments that are hosted on Google Forms. + +### Setup + +To use Google Forms integration, you need a Google API key: + +1. Visit the [Google Cloud Console](https://console.cloud.google.com/) +2. Create a new project or select an existing one +3. Enable the Google Forms API for your project +4. Create credentials (API key) for the Google Forms API +5. Set the API key as an environment variable: + +```bash +export GOOGLE_FORMS_API_KEY="your-api-key-here" +``` + +### Usage + +Import questionnaires from Google Forms using the URL or form ID: + +```python +from harmony import convert_files_to_instruments +from harmony.schemas.requests.text import RawFile +from harmony.schemas.enums.file_types import FileType + +# Create a RawFile with the Google Forms URL +file = RawFile( + file_name="Customer Satisfaction Survey", + file_type=FileType.google_forms, + content="https://docs.google.com/forms/d/e/1FAIpQLSc.../viewform" +) + +# Convert to Harmony instruments +instruments = convert_files_to_instruments([file]) + +# Access the questions +for instrument in instruments: + print(f"Form: {instrument.instrument_name}") + for question in instrument.questions: + print(f"{question.question_no}. {question.question_text}") + if question.options: + print(f" Options: {', '.join(question.options)}") +``` + +You can also use the form ID directly instead of the full URL: + +```python +file = RawFile( + file_name="Survey", + file_type=FileType.google_forms, + content="1FAIpQLSc_form_id_here" +) +``` + +### Supported Question Types + +Harmony can extract the following Google Forms question types: + +- **Multiple choice** - Radio buttons with multiple options +- **Checkboxes** - Multiple selection questions +- **Dropdown** - Select from a list +- **Linear scale** - Rating scale questions (e.g., 1-5) +- **Text** - Short answer and paragraph text +- **Grid questions** - Matrix/grid of choices + +Note: The form must be publicly accessible or shared with the appropriate permissions for the API to access it. + ## Matching instruments Once you have some instruments, you can match them with each other with a call to `match_instruments`. diff --git a/pyproject.toml b/pyproject.toml index 66cec7f..6c0a6dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,6 +59,7 @@ dependencies = [ "torch==2.2.2; python_version <= '3.13'", "transformers==4.50.3; python_version <= '3.13'", "fpdf2~=2.8.2; python_version <= '3.13'", + "google-api-python-client>=2.147.0; python_version <= '3.13'", ] [project.optional-dependencies] diff --git a/requirements.txt b/requirements.txt index 50f78e6..af6605d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,3 +17,4 @@ scipy==1.14.1 torch==2.2.2 transformers==4.50.3 fpdf2~=2.8.2 +google-api-python-client>=2.147.0 diff --git a/src/harmony/parsing/google_forms_parser.py b/src/harmony/parsing/google_forms_parser.py new file mode 100644 index 0000000..649fb8a --- /dev/null +++ b/src/harmony/parsing/google_forms_parser.py @@ -0,0 +1,318 @@ +''' +MIT License + +Copyright (c) 2023 Ulster University (https://www.ulster.ac.uk). +Project: Harmony (https://harmonydata.ac.uk) +Maintainer: Thomas Wood (https://fastdatascience.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +''' + +import os +import re +import traceback +from typing import List, Optional, Dict, Any +from googleapiclient.discovery import build +from googleapiclient.errors import HttpError +from langdetect import detect + +from harmony.schemas.requests.text import RawFile, Instrument, Question +from harmony.schemas.enums.file_types import FileType + + +def extract_form_id_from_url(url: str) -> Optional[str]: + """ + Extract Google Forms form ID from various URL formats. + + Supports: + - https://docs.google.com/forms/d/FORM_ID/edit + - https://docs.google.com/forms/d/e/FORM_ID/viewform + - Direct form ID + + Args: + url: Google Forms URL or form ID + + Returns: + Form ID string or None if not found + + Examples: + >>> extract_form_id_from_url("https://docs.google.com/forms/d/1FAIpQLSc.../viewform") + '1FAIpQLSc...' + >>> extract_form_id_from_url("1FAIpQLSc...") + '1FAIpQLSc...' + """ + # Pattern for extracting form ID from Google Forms URLs + # Matches: /d/FORM_ID or /d/e/FORM_ID + pattern = r'docs\.google\.com/forms/d/(?:e/)?([a-zA-Z0-9_-]+)' + + match = re.search(pattern, url) + if match: + return match.group(1) + + # If no match and input looks like a form ID (alphanumeric, hyphens, underscores) + if re.match(r'^[a-zA-Z0-9_-]+$', url): + return url + + return None + + +def fetch_form_structure(form_id: str, api_key: Optional[str] = None) -> Dict[str, Any]: + """ + Fetch form structure from Google Forms API using API key authentication. + + Args: + form_id: Google Forms form ID + api_key: Google API key (if None, reads from GOOGLE_FORMS_API_KEY env var) + + Returns: + Dictionary containing form structure from API + + Raises: + ValueError: If API key is not provided + HttpError: If API request fails + """ + if api_key is None: + api_key = os.environ.get('GOOGLE_FORMS_API_KEY') + + if not api_key: + raise ValueError( + "Google Forms API key not found. " + "Please set GOOGLE_FORMS_API_KEY environment variable or provide api_key parameter." + ) + + try: + # Build service with API key (no OAuth required) + service = build('forms', 'v1', developerKey=api_key) + + # Fetch form + result = service.forms().get(formId=form_id).execute() + return result + + except HttpError as error: + error_details = error.error_details if hasattr(error, 'error_details') else str(error) + print(f"Error fetching Google Form {form_id}: {error_details}") + traceback.print_exc() + raise + + +def parse_question_item(item: Dict[str, Any], question_number: int) -> Optional[Question]: + """ + Parse a single question item from Google Forms API response. + + Args: + item: Dictionary containing question item data + question_number: Sequential number for the question + + Returns: + Question object or None if item is not a question + """ + # Skip if not a question item + if 'questionItem' not in item: + return None + + question_item = item['questionItem'] + question_data = question_item.get('question', {}) + + # Extract question text from title + question_text = item.get('title', '').strip() + if not question_text: + return None + + # Extract description as intro if available + question_intro = item.get('description', '') + + # Extract options based on question type + options = [] + + # Multiple choice, checkbox, or dropdown + if 'choiceQuestion' in question_data: + choice_question = question_data['choiceQuestion'] + choice_options = choice_question.get('options', []) + options = [opt.get('value', '') for opt in choice_options if opt.get('value')] + + # Linear scale (rating scale) + elif 'scaleQuestion' in question_data: + scale_question = question_data['scaleQuestion'] + low = scale_question.get('low', 1) + high = scale_question.get('high', 5) + low_label = scale_question.get('lowLabel', '') + high_label = scale_question.get('highLabel', '') + + # Create options as range + options = [str(i) for i in range(low, high + 1)] + + # Add labels if present + if low_label or high_label: + if question_intro: + question_intro += f" [{low_label} - {high_label}]" + else: + question_intro = f"[{low_label} - {high_label}]" + + # Grid question (matrix) + elif 'rowQuestion' in question_data: + row_question = question_data['rowQuestion'] + # For grid questions, title is the column header + # We'll extract row options as answer choices + if 'rows' in row_question: + row_items = row_question.get('rows', []) + options = [row.get('title', '') for row in row_items if row.get('title')] + + # Text question, file upload, date/time - no options + # These will have empty options list + + return Question( + question_no=str(question_number), + question_intro=question_intro, + question_text=question_text, + options=options, + source_page=0 + ) + + +def parse_google_form_structure(form_data: Dict[str, Any]) -> Instrument: + """ + Parse complete Google Form structure into Harmony Instrument. + + Args: + form_data: Complete form structure from Google Forms API + + Returns: + Instrument object with all questions and metadata + """ + # Extract form metadata + info = form_data.get('info', {}) + form_title = info.get('title', 'Untitled Google Form') + form_description = info.get('description', '') + + # Extract document ID + form_id = form_data.get('formId', '') + + # Parse all question items + questions = [] + items = form_data.get('items', []) + + question_counter = 0 + for item in items: + question = parse_question_item(item, question_counter + 1) + if question: + questions.append(question) + question_counter += 1 + + # Detect language from questions + language = "en" + if questions: + try: + # Combine first few questions for language detection + sample_text = " ".join([q.question_text for q in questions[:5] if q.question_text]) + if sample_text: + language = detect(sample_text) + except: + print("Error identifying language in Google Form") + traceback.print_exc() + language = "en" + + # Create metadata dictionary + metadata = { + 'form_id': form_id, + 'title': form_title, + 'description': form_description, + 'source': 'google_forms', + 'document_title': info.get('documentTitle', ''), + } + + # Create instrument + instrument = Instrument( + file_id=form_id, + instrument_id=form_id, + instrument_name=form_title, + file_name=f"{form_title}.google_forms", + file_type=FileType.google_forms, + file_section="", + language=language, + questions=questions, + metadata=metadata + ) + + return instrument + + +def convert_google_forms_to_instruments(file: RawFile) -> List[Instrument]: + """ + Convert Google Forms URL or form ID to Harmony instruments. + + This function handles two scenarios: + 1. content contains a Google Forms URL or form ID + 2. content contains pre-fetched JSON structure from Google Forms API + + Args: + file: RawFile object with content containing: + - Google Forms URL (e.g., https://docs.google.com/forms/d/e/1FAIpQLSc.../viewform) + - Form ID directly + - JSON structure from Forms API (for testing) + + Returns: + List containing single Instrument object with parsed questions + + Raises: + ValueError: If form ID cannot be extracted or API key is missing + HttpError: If API request fails + + Examples: + >>> file = RawFile( + ... file_name="Survey.google_forms", + ... file_type=FileType.google_forms, + ... content="https://docs.google.com/forms/d/e/1FAIpQLSc.../viewform" + ... ) + >>> instruments = convert_google_forms_to_instruments(file) + """ + try: + content = file.content.strip() + + # Check if content is JSON (pre-fetched structure for testing) + if content.startswith('{'): + import json + form_data = json.loads(content) + else: + # Extract form ID from URL or direct ID + form_id = extract_form_id_from_url(content) + if not form_id: + raise ValueError( + f"Could not extract form ID from: {content}. " + "Please provide a valid Google Forms URL or form ID." + ) + + # Fetch form structure from API + form_data = fetch_form_structure(form_id) + + # Parse form structure + instrument = parse_google_form_structure(form_data) + + # Update file_name if it was default + if file.file_name == "Untitled file" or not file.file_name: + instrument.file_name = instrument.instrument_name + else: + instrument.file_name = file.file_name + + return [instrument] + + except Exception as e: + print(f"Error converting Google Form: {str(e)}") + traceback.print_exc() + raise diff --git a/src/harmony/parsing/wrapper_all_parsers.py b/src/harmony/parsing/wrapper_all_parsers.py index 4964eed..6ef2545 100644 --- a/src/harmony/parsing/wrapper_all_parsers.py +++ b/src/harmony/parsing/wrapper_all_parsers.py @@ -25,6 +25,7 @@ from harmony.parsing.pdf_parser import convert_pdf_to_instruments from harmony.parsing.text_parser import convert_text_to_instruments from harmony.parsing.html_parser import convert_html_to_instruments +from harmony.parsing.google_forms_parser import convert_google_forms_to_instruments from harmony.schemas.enums.file_types import FileType from harmony.schemas.requests.text import RawFile, Instrument @@ -47,6 +48,8 @@ def _get_instruments_from_file(file): instruments_from_this_file = convert_excel_to_instruments(file) elif file.file_type == FileType.html or file.file_type == FileType.htm: instruments_from_this_file = convert_html_to_instruments(file) + elif file.file_type == FileType.google_forms: + instruments_from_this_file = convert_google_forms_to_instruments(file) else: instruments_from_this_file = [] return instruments_from_this_file diff --git a/src/harmony/schemas/enums/file_types.py b/src/harmony/schemas/enums/file_types.py index 6160a77..9f1846a 100644 --- a/src/harmony/schemas/enums/file_types.py +++ b/src/harmony/schemas/enums/file_types.py @@ -32,3 +32,4 @@ class FileType(str, Enum): docx: str = 'docx' html: str = 'html' htm: str = 'htm' + google_forms: str = 'google_forms' diff --git a/tests/test_google_forms_parser.py b/tests/test_google_forms_parser.py new file mode 100644 index 0000000..f0143b2 --- /dev/null +++ b/tests/test_google_forms_parser.py @@ -0,0 +1,249 @@ +''' +MIT License + +Copyright (c) 2023 Ulster University (https://www.ulster.ac.uk). +Project: Harmony (https://harmonydata.ac.uk) +Maintainer: Thomas Wood (https://fastdatascience.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +''' + +import sys +import unittest +import json + +sys.path.append("../src") + +from harmony.parsing.google_forms_parser import ( + extract_form_id_from_url, + parse_question_item, + parse_google_form_structure, + convert_google_forms_to_instruments +) +from harmony.schemas.requests.text import RawFile +from harmony.schemas.enums.file_types import FileType + + +# Mock data for testing without API calls +MOCK_FORM_RESPONSE = { + "formId": "test_form_12345", + "info": { + "title": "GAD-7 Anxiety Scale", + "description": "Generalized Anxiety Disorder Assessment", + "documentTitle": "GAD-7 Form" + }, + "items": [ + { + "title": "Feeling nervous, anxious or on edge", + "description": "", + "questionItem": { + "question": { + "choiceQuestion": { + "type": "RADIO", + "options": [ + {"value": "Not at all"}, + {"value": "Several days"}, + {"value": "More than half the days"}, + {"value": "Nearly every day"} + ] + } + } + } + }, + { + "title": "Not being able to stop or control worrying", + "description": "", + "questionItem": { + "question": { + "choiceQuestion": { + "type": "RADIO", + "options": [ + {"value": "Not at all"}, + {"value": "Several days"}, + {"value": "More than half the days"}, + {"value": "Nearly every day"} + ] + } + } + } + }, + { + "title": "Rate your overall anxiety level", + "description": "On a scale from 1 to 5", + "questionItem": { + "question": { + "scaleQuestion": { + "low": 1, + "high": 5, + "lowLabel": "Not anxious", + "highLabel": "Extremely anxious" + } + } + } + }, + { + "title": "Please provide additional comments", + "description": "", + "questionItem": { + "question": { + "textQuestion": { + "paragraph": True + } + } + } + }, + { + "title": "Section Header", + "description": "This is just a section, not a question" + } + ] +} + + +class TestExtractFormId(unittest.TestCase): + """Test form ID extraction from various URL formats.""" + + def test_extract_from_viewform_url(self): + """Test extraction from standard viewform URL.""" + url = "https://docs.google.com/forms/d/e/1FAIpQLSc_example_id_12345/viewform" + form_id = extract_form_id_from_url(url) + self.assertEqual(form_id, "1FAIpQLSc_example_id_12345") + + def test_extract_from_edit_url(self): + """Test extraction from edit URL.""" + url = "https://docs.google.com/forms/d/1FAIpQLSc_another_id_67890/edit" + form_id = extract_form_id_from_url(url) + self.assertEqual(form_id, "1FAIpQLSc_another_id_67890") + + def test_extract_from_direct_id(self): + """Test extraction when input is already a form ID.""" + direct_id = "1FAIpQLSc_direct_form_id_xyz" + form_id = extract_form_id_from_url(direct_id) + self.assertEqual(form_id, direct_id) + + def test_invalid_url(self): + """Test extraction from invalid URL returns None.""" + invalid_url = "https://example.com/not-a-google-form" + form_id = extract_form_id_from_url(invalid_url) + self.assertIsNone(form_id) + + +class TestParseQuestionItem(unittest.TestCase): + """Test parsing of individual question items.""" + + def test_parse_multiple_choice(self): + """Test parsing a multiple choice question.""" + item = MOCK_FORM_RESPONSE["items"][0] + question = parse_question_item(item, 1) + + self.assertIsNotNone(question) + self.assertEqual(question.question_no, "1") + self.assertEqual(question.question_text, "Feeling nervous, anxious or on edge") + self.assertEqual(len(question.options), 4) + self.assertIn("Not at all", question.options) + self.assertIn("Nearly every day", question.options) + + def test_parse_scale_question(self): + """Test parsing a linear scale question.""" + item = MOCK_FORM_RESPONSE["items"][2] + question = parse_question_item(item, 3) + + self.assertIsNotNone(question) + self.assertEqual(question.question_no, "3") + self.assertEqual(question.question_text, "Rate your overall anxiety level") + self.assertEqual(question.options, ["1", "2", "3", "4", "5"]) + self.assertIn("Not anxious", question.question_intro) + self.assertIn("Extremely anxious", question.question_intro) + + def test_parse_text_question(self): + """Test parsing a text question (no options).""" + item = MOCK_FORM_RESPONSE["items"][3] + question = parse_question_item(item, 4) + + self.assertIsNotNone(question) + self.assertEqual(question.question_no, "4") + self.assertEqual(question.question_text, "Please provide additional comments") + self.assertEqual(question.options, []) + + def test_skip_non_question_item(self): + """Test that non-question items (sections, page breaks) are skipped.""" + item = MOCK_FORM_RESPONSE["items"][4] + question = parse_question_item(item, 5) + + self.assertIsNone(question) + + +class TestParseGoogleFormStructure(unittest.TestCase): + """Test parsing complete form structure.""" + + def test_parse_complete_form(self): + """Test parsing a complete form structure into Instrument.""" + instrument = parse_google_form_structure(MOCK_FORM_RESPONSE) + + # Verify basic metadata + self.assertEqual(instrument.file_id, "test_form_12345") + self.assertEqual(instrument.instrument_id, "test_form_12345") + self.assertEqual(instrument.instrument_name, "GAD-7 Anxiety Scale") + self.assertEqual(instrument.file_type, FileType.google_forms) + + # Verify questions (should be 4 questions, skipping the section) + self.assertEqual(len(instrument.questions), 4) + + # Verify metadata dictionary + self.assertIsNotNone(instrument.metadata) + self.assertEqual(instrument.metadata["form_id"], "test_form_12345") + self.assertEqual(instrument.metadata["title"], "GAD-7 Anxiety Scale") + self.assertEqual(instrument.metadata["source"], "google_forms") + + # Verify language detection (should default to "en") + self.assertIsNotNone(instrument.language) + + +class TestConvertGoogleFormsToInstruments(unittest.TestCase): + """Test complete conversion workflow.""" + + def test_convert_from_json_content(self): + """Test conversion from pre-fetched JSON content (for testing).""" + file = RawFile( + file_name="GAD-7.google_forms", + file_type=FileType.google_forms, + content=json.dumps(MOCK_FORM_RESPONSE) + ) + + instruments = convert_google_forms_to_instruments(file) + + # Verify we get a list with one instrument + self.assertEqual(len(instruments), 1) + + instrument = instruments[0] + + # Verify instrument properties + self.assertEqual(instrument.instrument_name, "GAD-7 Anxiety Scale") + self.assertEqual(instrument.file_type, FileType.google_forms) + self.assertEqual(len(instrument.questions), 4) + + # Verify first question + first_q = instrument.questions[0] + self.assertEqual(first_q.question_text, "Feeling nervous, anxious or on edge") + self.assertEqual(len(first_q.options), 4) + + +if __name__ == '__main__': + unittest.main()