Aiuanyu · google-labs-jules · Oct 11, 2025 · Oct 12, 2025
diff --git a/build_trie.py b/build_trie.py
@@ -0,0 +1,114 @@
+# -*- coding: utf-8 -*-
+import os
+import csv
+import json
+import re
+
+def get_words_from_cert_csv(file_path):
+    """從 CERT CSV 檔案中提取詞彙。"""
+    words = set()
+    with open(file_path, 'r', encoding='utf-8-sig') as f:
+        reader = csv.reader(f)
+        try:
+            header = next(reader)
+        except StopIteration:
+            return set()  # 跳過空檔案
+
+        hakka_word_index = -1
+        # 尋找包含 "客家語" 的欄位索引
+        for i, col in enumerate(header):
+            if "客家語" in col:
+                hakka_word_index = i
+                break
+
+        if hakka_word_index == -1:
+            # print(f"  - 警告: 在 {os.path.basename(file_path)} 中找不到 '客家語' 欄位。")
+            return set()
+
+        for row in reader:
+            if len(row) > hakka_word_index:
+                word = row[hakka_word_index].strip()
+                if word:
+                    # 有些詞條會用 / 分隔，例如 "人客/客人"
+                    for w in re.split(r'[/\s]', word):
+                        if w:
+                            words.add(w)
+    return words
+
+
+def get_words_from_gip_csv(file_path):
+    """從 GIP CSV 檔案中提取詞彙。"""
+    words = set()
+    with open(file_path, 'r', encoding='utf-8-sig') as f:
+        try:
+            reader = csv.DictReader(f)
+            for row in reader:
+                word = row.get('詞目', '').strip()
+                if word:
+                    # 有些詞條會用 / 分隔
+                    for w in re.split(r'[/\s]', word):
+                        if w:
+                            words.add(w)
+        except Exception:
+            # 檔案可能是空的或標頭有問題
+            return set()
+    return words
+
+def build_trie(words):
+    """從一組詞彙建立 Trie 樹。"""
+    root = {}
+    for word in words:
+        node = root
+        for char in word:
+            node = node.setdefault(char, {})
+        node['is_end'] = True
+    return root
+
+def main():
+    """主函式，處理檔案並建立 Trie。"""
+    all_words = set()
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    if script_dir == "/Users/jules/workspace":
+        script_dir = "." # 在本地執行的路徑修正
+
+    cert_dir = os.path.join(script_dir, 'data', 'cert')
+    gip_dir = os.path.join(script_dir, 'data', 'gip')
+
+    print("--- 開始從 CERT 檔案提取詞彙 ---")
+    if os.path.isdir(cert_dir):
+        for filename in sorted(os.listdir(cert_dir)):
+            if filename.endswith('.csv'):
+                file_path = os.path.join(cert_dir, filename)
+                # print(f"> 讀取中: {filename}")
+                words = get_words_from_cert_csv(file_path)
+                all_words.update(words)
+    else:
+        print(f"  ✗ 錯誤: 找不到目錄 '{cert_dir}'")
+
+
+    print("--- 開始從 GIP 檔案提取詞彙 ---")
+    if os.path.isdir(gip_dir):
+        for filename in sorted(os.listdir(gip_dir)):
+            if filename.endswith('.csv'):
+                file_path = os.path.join(gip_dir, filename)
+                # print(f"> 讀取中: {filename}")
+                words = get_words_from_gip_csv(file_path)
+                all_words.update(words)
+    else:
+         print(f"  ✗ 錯誤: 找不到目錄 '{gip_dir}'")
+
+    print(f"--- 總共找到 {len(all_words)} 個不重複的詞彙 ---")
+
+    print("--- 正在建立詞彙樹 (Trie) ---")
+    trie = build_trie(all_words)
+
+    output_path = os.path.join(script_dir, 'trie.json')
+    print(f"--- 正在將詞彙樹寫入到 {output_path} ---")
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(trie, f, ensure_ascii=False)
+
+    print("--- 全部處理完成 ---")
+
+
+# Build the trie only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    main()
diff --git a/jules-scratch/verification/verify_segmentation.py b/jules-scratch/verification/verify_segmentation.py
@@ -0,0 +1,73 @@
+import asyncio
+import subprocess
+import time
+from playwright.async_api import async_playwright, expect
+
+async def main():
+    # 啟動一個簡單的 HTTP 伺服器
+    server_process = subprocess.Popen(
+        ['python3', '-m', 'http.server'],
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL
+    )
+    time.sleep(1) # 等待伺服器啟動
+
+    try:
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=True)
+            page = await browser.new_page()
+
+            # 1. 導航到頁面
+            await page.goto('http://localhost:8000/index.html', timeout=60000)
+
+            # 2. 等待 App 初始化完成
+            await expect(page.locator("#loading-indicator")).to_be_hidden(timeout=60000)
+
+            # 3. 關閉所有彈出視窗 (以正確的順序)
+            whats_new_modal = page.locator('#whatsNewModal')
+            if await whats_new_modal.is_visible():
+                await page.locator('#whatsNewModalCloseBtn').click()
+                await expect(whats_new_modal).to_be_hidden()
+
+            info_modal = page.locator('#infoModal')
+            if await info_modal.is_visible():
+                await page.locator('#infoModalCloseBtn').click()
+                await expect(info_modal).to_be_hidden()
+
+            # 4. 點擊方言連結
+            dialect_link = page.locator('span[data-varname="四基"] a')
+            await expect(dialect_link).to_be_visible(timeout=10000)
+            await dialect_link.click()
+
+            # 5. **最終修正**: 等待新的分類面板被渲染出來
+            await expect(page.locator("#cat-panel", has_text="再擇類別：")).to_be_visible(timeout=10000)
+
+            # 6. 現在可以安全地點擊分類了
+            category_radio = page.locator('input[name="category"][value="數字、時間"]')
+            await category_radio.click()
+
+            # 7. 等待表格內容出現
+            await expect(page.locator("#category-table tbody tr")).to_be_visible()
+
+            # 8. 找到並點擊第一個斷詞連結
+            first_segmented_word = page.locator('a.segmented-word').first
+            await expect(first_segmented_word).to_be_visible()
+            await first_segmented_word.click()
+
+            # 9. 等待並驗證查詞 popup
+            popup = page.locator('#selectionPopup')
+            await expect(popup).to_be_visible()
+            word_text = await first_segmented_word.text_content()
+            await expect(popup.locator('#selectionPopupTitle')).to_have_text(f'尋「{word_text}」个讀音')
+
+            # 10. 截圖
+            await page.screenshot(path="jules-scratch/verification/verification.png")
+
+            await browser.close()
+    finally:
+        # 確保伺服器進程被終止
+        server_process.terminate()
+        server_process.wait()
+
+# Run the browser check only when this file is executed as a script.
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/main.js b/main.js
@@ -1,3 +1,110 @@
+/**
+ * 使用 Trie 樹對句子進行最大長度匹配斷詞。
+ * @param {string} text - 需要斷詞的純文字句子。
+ * @param {object} trie - 預載入的詞彙樹。
+ * @returns {Array<object>} - 回傳一個物件陣列，每個物件包含 'text' 和 'isWord' 屬性。
+ */
+function segmentSentenceWithTrie(text, trie) {
+    if (!text || !trie) return [{ text: text, isWord: false }];
+
+    const results = [];
+    let currentIndex = 0;
+
+    while (currentIndex < text.length) {
+        let longestMatch = '';
+        let lastFoundIndex = currentIndex;
+
+        let currentNode = trie;
+        for (let i = currentIndex; i < text.length; i++) {
+            const char = text[i];
+            if (currentNode[char]) {
+                currentNode = currentNode[char];
+                if (currentNode.is_end) {
+                    longestMatch = text.substring(currentIndex, i + 1);
+                    lastFoundIndex = i;
+                }
+            } else {
+                break;
+            }
+        }
+
+        if (longestMatch) {
+            results.push({ text: longestMatch, isWord: true });
+            currentIndex = lastFoundIndex + 1;
+        } else {
+            // 如果在目前位置找不到任何詞，就將這個字當作單一非詞字元處理
+            const unmatchableChar = text[currentIndex];
+            // 尋找連續的非詞字元並將它們合併
+            let nonWordEndIndex = currentIndex + 1;
+            while (nonWordEndIndex < text.length) {
+                const nextChar = text[nonWordEndIndex];
+                let nextNode = trie;
+                let isNextCharStartOfWord = false;
+                if (nextNode[nextChar]) {
+                    isNextCharStartOfWord = true;
+                }
+
+                if (isNextCharStartOfWord) {
+                    break;
+                }
+                nonWordEndIndex++;
+            }
+            const nonWordText = text.substring(currentIndex, nonWordEndIndex);
+            results.push({ text: nonWordText, isWord: false });
+            currentIndex = nonWordEndIndex;
+        }
+    }
+    return results;
+}
+
+
+/**
+ * 將斷詞結果應用到指定的 DOM 元素上，將詞彙轉換為可點擊的連結。
+ * @param {HTMLElement} element - 要處理的元素，例如包含例句的 <span> 或 <td>。
+ */
+function applySegmentationToElement(element) {
+    if (!element || !element.textContent || !window.wordTrie) {
+        return;
+    }
+
+    // 處理 <br> 標籤，將其暫時替換為特殊的分隔符號
+    const originalHtml = element.innerHTML;
+    const placeholder = '||BR||';
+    const textSegments = originalHtml.split(/<br\s*\/?>/i);
+
+    let finalHtml = '';
+
+    textSegments.forEach((segment, index) => {
+        // 為了避免重複處理，先建立一個臨時的 div 來取得純文字
+        const tempDiv = document.createElement('div');
+        tempDiv.innerHTML = segment;
+        const plainText = tempDiv.textContent || tempDiv.innerText || '';
+
+        if (plainText) {
+            const segmentedParts = segmentSentenceWithTrie(plainText, window.wordTrie);
+            let segmentHtml = '';
+            segmentedParts.forEach(part => {
+                if (part.isWord) {
+                    segmentHtml += `<a href="#" class="segmented-word" data-word="${part.text}">${part.text}</a>`;
+                } else {
+                    // 對非詞部分進行 HTML 編碼，避免 XSS 風險
+                    const encodedText = part.text.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;').replace(/"/g, '&quot;');
+                    segmentHtml += encodedText;
+                }
+            });
+            finalHtml += segmentHtml;
+        }
+
+        // 在每段之間加回 <br>
+        if (index < textSegments.length - 1) {
+            finalHtml += '<br>';
+        }
+    });
+
+    element.innerHTML = finalHtml;
+}
+
+
 function handleDomainMigration() {
   if (window.location.hostname !== 'aiuanyu.github.io' && window.location.hostname !== 'fix-migration-dark-theme.hakspring.pages.dev' && window.location.hostname !== 'feat-dark-theme-loading-over.hakspring.pages.dev' && window.location.hostname !== 'fix-migration-page-scroll.hakspring.pages.dev') {
     return false;
@@ -252,7 +359,8 @@ const DATA_FILES_TO_CACHE = [
   // 其他資料
   'tone_mapping.json',
   'NAmedias.json',
-  'exclusions.json'
+  'exclusions.json',
+  'trie.json'
 ];
 
 const DB_NAME = 'HakkaDataDB';
@@ -1274,7 +1382,8 @@ function getKeyNameFromPath(filePath) {
     const otherMap = {
         'tone_mapping': 'toneMappingData',
         'NAmedias': 'missingAudioData',
-        'exclusions': '例外音檔'
+        'exclusions': '例外音檔',
+        'trie': 'wordTrie'
     };
     return otherMap[fileName];
 }
@@ -2244,6 +2353,7 @@ function displayQueryResults(results, keyword, searchMode, summaryText, selected
             const sentenceSpan = document.createElement('span');
             sentenceSpan.className = 'sentence';
             sentenceSpan.innerHTML = (highlight.sentence ? line['例句'].replace(highlightRegex, '<mark>$1</mark>') : line['例句']).replace(/\n/g, '<br>');
+            applySegmentationToElement(sentenceSpan); // 在這裡呼叫斷詞
             td3.appendChild(sentenceSpan);
             td3.appendChild(document.createElement('br'));
 
@@ -3348,6 +3458,7 @@ function renderCategoryItems(itemsToRender, dialectInfo, category, isInitialLoad
             const sentenceSpan = document.createElement('span');
             sentenceSpan.className = 'sentence';
             sentenceSpan.innerHTML = line.例句.replace(/"/g, '').replace(/\n/g, '<br>');
+            applySegmentationToElement(sentenceSpan); // 在這裡呼叫斷詞
             td3.appendChild(sentenceSpan);
             td3.appendChild(document.createElement('br'));
             if (dialectInfo.級名 === '高級' || (missingAudioInfo && missingAudioInfo.sentence === false)) {
@@ -4402,6 +4513,35 @@ function handleAutoPlay(autoPlayTargetRowId, dialectInfo, category) {
           if (!isNaN(rowIndex) && g_currentSearchResults[rowIndex]) {
               toggleSearchAccordion(button, g_currentSearchResults[rowIndex]);
           }
+          return; // 處理完畢，返回
+      }
+
+      const segmentedWord = event.target.closest('a.segmented-word');
+      if (segmentedWord) {
+          event.preventDefault();
+          const word = segmentedWord.dataset.word;
+          if (word) {
+              const trElement = segmentedWord.closest('tr');
+              let contextualDialect = null;
+
+              if (trElement) {
+                if (trElement.classList.contains('accordion-row')) {
+                    const dialectClass = Array.from(trElement.classList).find(c => ['四縣', '海陸', '大埔', '饒平', '詔安'].includes(c));
+                    if (dialectClass) {
+                        contextualDialect = dialectClass;
+                    }
+                } else if (trElement.closest('#category-table')) {
+                    if (g_currentDialectInfo && g_currentDialectInfo.腔名) {
+                        contextualDialect = g_currentDialectInfo.腔名;
+                    }
+                } else {
+                    // Fallback for search results
+                    contextualDialect = currentActiveMainDialectName;
+                }
+              }
+
+              showPronunciationPopup(word, null, segmentedWord, null, contextualDialect);
+          }
       }
   });
 }
@@ -4573,6 +4713,7 @@ function createComparisonRow(line, dialectInfo) {
         const sentenceSpan = document.createElement('span');
         sentenceSpan.className = 'sentence';
         sentenceSpan.innerHTML = line.例句.replace(/"/g, '').replace(/\n/g, '<br>');
+        applySegmentationToElement(sentenceSpan); // 在這裡呼叫斷詞
         td3.appendChild(sentenceSpan);
         td3.appendChild(document.createElement('br'));
         if (dialectInfo.級名 === '高級' || (missingAudioInfo && missingAudioInfo.sentence === false)) {