Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 114 additions & 0 deletions build_trie.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
import os
import csv
import json
import re

def get_words_from_cert_csv(file_path):
    """Extract vocabulary entries from a CERT CSV file.

    The first row is treated as the header. The first column whose title
    contains "客家語" is used as the word column. Each cell in that column is
    split on "/" and whitespace (some entries list variants, e.g.
    "人客/客人") and every non-empty piece is collected.

    Args:
        file_path: Path to a CSV file; it is read as UTF-8 with an optional BOM.

    Returns:
        A set of words. The set is empty when the file has no rows or no
        header title contains "客家語".
    """
    with open(file_path, 'r', encoding='utf-8-sig') as csv_file:
        rows = csv.reader(csv_file)

        # An empty file has no header row at all.
        header = next(rows, None)
        if header is None:
            return set()

        # Position of the first header title mentioning "客家語", or -1.
        column = next(
            (pos for pos, title in enumerate(header) if "客家語" in title),
            -1,
        )
        if column == -1:
            return set()

        collected = set()
        for fields in rows:
            # Rows too short to reach the word column are ignored.
            if len(fields) <= column:
                continue
            cell = fields[column].strip()
            collected.update(
                piece for piece in re.split(r'[/\s]', cell) if piece
            )
        return collected


def get_words_from_gip_csv(file_path):
    """Extract vocabulary entries from a GIP CSV file.

    The file is read with ``csv.DictReader`` and the value of the "詞目"
    column is taken from each row. Each value is split on "/" and whitespace
    (some entries list several variants) and every non-empty piece is
    collected.

    Args:
        file_path: Path to a CSV file; it is read as UTF-8 with an optional BOM.

    Returns:
        A set of words. The set is empty when the file is empty, has no
        "詞目" column, or cannot be parsed or decoded.
    """
    words = set()
    # newline='' lets the csv module do its own newline handling, as the
    # csv documentation recommends.
    with open(file_path, 'r', encoding='utf-8-sig', newline='') as f:
        try:
            for row in csv.DictReader(f):
                # DictReader fills fields missing from a short row with None,
                # so coerce to '' before stripping. Otherwise a single short
                # row would raise and throw away the whole file.
                word = (row.get('詞目') or '').strip()
                if not word:
                    continue
                words.update(w for w in re.split(r'[/\s]', word) if w)
        except (csv.Error, UnicodeDecodeError):
            # Best effort: a malformed or undecodable file contributes nothing.
            return set()
    return words

def build_trie(words):
    """Build a nested-dict trie from an iterable of words.

    Each node is a dict keyed by a single character. A node that terminates
    a word additionally carries the key ``'is_end'`` with the value ``True``.

    Words are inserted in sorted order so that the key order of the
    resulting dicts, and therefore the serialised JSON, is the same on every
    run. Iterating a set of strings directly would make the order depend on
    hash randomisation.

    Args:
        words: Iterable of strings.

    Returns:
        The root node of the trie (an empty dict for no words).
    """
    root = {}
    for word in sorted(words):
        node = root
        for char in word:
            node = node.setdefault(char, {})
        node['is_end'] = True
    return root

def main():
    """Collect words from the CERT and GIP CSV folders and write trie.json.

    Reads every ``*.csv`` file (in sorted filename order) under ``data/cert``
    and ``data/gip`` next to this script, merges the extracted words, builds
    a trie from them and writes it as JSON to ``trie.json`` in the script
    directory. Progress is reported on stdout; a missing data directory is
    reported and skipped.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    # Path fix-up for one specific local checkout; falls back to the current
    # working directory there.
    if script_dir == "/Users/jules/workspace":
        script_dir = "."

    # (banner, directory, extractor) for each data source, in processing order.
    sources = (
        (
            "--- 開始從 CERT 檔案提取詞彙 ---",
            os.path.join(script_dir, 'data', 'cert'),
            get_words_from_cert_csv,
        ),
        (
            "--- 開始從 GIP 檔案提取詞彙 ---",
            os.path.join(script_dir, 'data', 'gip'),
            get_words_from_gip_csv,
        ),
    )

    all_words = set()
    for banner, directory, extract in sources:
        print(banner)
        if not os.path.isdir(directory):
            print(f" ✗ 錯誤: 找不到目錄 '{directory}'")
            continue
        for filename in sorted(os.listdir(directory)):
            if not filename.endswith('.csv'):
                continue
            all_words.update(extract(os.path.join(directory, filename)))

    print(f"--- 總共找到 {len(all_words)} 個不重複的詞彙 ---")

    print("--- 正在建立詞彙樹 (Trie) ---")
    trie = build_trie(all_words)

    output_path = os.path.join(script_dir, 'trie.json')
    print(f"--- 正在將詞彙樹寫入到 {output_path} ---")
    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(trie, out_file, ensure_ascii=False)

    print("--- 全部處理完成 ---")


if __name__ == '__main__':
main()
73 changes: 73 additions & 0 deletions jules-scratch/verification/verify_segmentation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import asyncio
import subprocess
import time
from playwright.async_api import async_playwright, expect

async def _dismiss_modal_if_visible(page, modal_selector, close_button_selector):
    """Click the close button of a modal if it is showing, then wait for it to hide."""
    modal = page.locator(modal_selector)
    if await modal.is_visible():
        await page.locator(close_button_selector).click()
        await expect(modal).to_be_hidden()


async def main():
    """Drive the app in headless Chromium and verify the segmented-word popup.

    Starts ``python3 -m http.server`` in the current working directory, opens
    ``index.html``, navigates to a dialect and category, clicks the first
    segmented word, checks the pronunciation popup title and saves a
    screenshot. The server process is always terminated on exit.
    """
    # Serve the working directory over HTTP. No port is passed, so the
    # server uses http.server's default, which the URL below must match.
    server_process = subprocess.Popen(
        ['python3', '-m', 'http.server'],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL
    )

    try:
        # Give the server a moment to start. This uses a non-blocking sleep
        # because we are inside a coroutine, and it sits inside the try so an
        # interrupt during the wait still terminates the server.
        await asyncio.sleep(1)

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()

            # 1. Navigate to the page.
            await page.goto('http://localhost:8000/index.html', timeout=60000)

            # 2. Wait for the app to finish initialising.
            await expect(page.locator("#loading-indicator")).to_be_hidden(timeout=60000)

            # 3. Close any pop-up modals, in this order.
            await _dismiss_modal_if_visible(page, '#whatsNewModal', '#whatsNewModalCloseBtn')
            await _dismiss_modal_if_visible(page, '#infoModal', '#infoModalCloseBtn')

            # 4. Click the dialect link.
            dialect_link = page.locator('span[data-varname="四基"] a')
            await expect(dialect_link).to_be_visible(timeout=10000)
            await dialect_link.click()

            # 5. Wait for the new category panel to be rendered.
            await expect(page.locator("#cat-panel", has_text="再擇類別:")).to_be_visible(timeout=10000)

            # 6. The category can now be clicked safely.
            category_radio = page.locator('input[name="category"][value="數字、時間"]')
            await category_radio.click()

            # 7. Wait for the table content to appear.
            await expect(page.locator("#category-table tbody tr")).to_be_visible()

            # 8. Find and click the first segmented-word link.
            first_segmented_word = page.locator('a.segmented-word').first
            await expect(first_segmented_word).to_be_visible()
            await first_segmented_word.click()

            # 9. Wait for the lookup popup and verify its title.
            popup = page.locator('#selectionPopup')
            await expect(popup).to_be_visible()
            word_text = await first_segmented_word.text_content()
            await expect(popup.locator('#selectionPopupTitle')).to_have_text(f'尋「{word_text}」个讀音')

            # 10. Take a screenshot.
            await page.screenshot(path="jules-scratch/verification/verification.png")

            await browser.close()
    finally:
        # Make sure the server process is terminated.
        server_process.terminate()
        server_process.wait()

if __name__ == '__main__':
asyncio.run(main())
145 changes: 143 additions & 2 deletions main.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,110 @@
/**
 * Segment a sentence by longest-match lookup against a word trie.
 *
 * The sentence is walked by Unicode code point rather than by UTF-16 code
 * unit. Trie keys built by iterating a Python string are whole code points,
 * so characters outside the Basic Multilingual Plane (for example 𠊎) would
 * never match if the text were indexed one code unit at a time.
 *
 * At each position the longest trie word starting there is emitted as a
 * word. If no word starts there, that character and all following
 * characters that are not the first character of any trie entry are merged
 * into one non-word part.
 *
 * @param {string} text - Plain-text sentence to segment.
 * @param {object} trie - Preloaded word trie; nodes are objects keyed by
 *   character, with a truthy `is_end` on nodes that terminate a word.
 * @returns {Array<object>} Parts in order, each with `text` and `isWord`.
 *   When `text` or `trie` is falsy, a single non-word part holding `text`.
 */
function segmentSentenceWithTrie(text, trie) {
    if (!text || !trie) return [{ text: text, isWord: false }];

    // Array.from splits a string by code point, keeping surrogate pairs whole.
    const chars = Array.from(text);
    const results = [];
    let pos = 0;

    while (pos < chars.length) {
        // Walk the trie as far as possible, remembering the end (exclusive)
        // of the longest complete word seen so far.
        let matchEnd = -1;
        let node = trie;
        for (let i = pos; i < chars.length; i++) {
            const next = node[chars[i]];
            if (!next) break;
            node = next;
            if (node.is_end) matchEnd = i + 1;
        }

        if (matchEnd !== -1) {
            results.push({ text: chars.slice(pos, matchEnd).join(''), isWord: true });
            pos = matchEnd;
            continue;
        }

        // No word starts here: consume this character plus every following
        // character that cannot begin a trie entry, as a single non-word run.
        let runEnd = pos + 1;
        while (runEnd < chars.length && !trie[chars[runEnd]]) {
            runEnd++;
        }
        results.push({ text: chars.slice(pos, runEnd).join(''), isWord: false });
        pos = runEnd;
    }
    return results;
}


/**
 * Replace an element's content with a segmented version in which every
 * recognised word becomes a clickable `<a class="segmented-word">` link.
 *
 * The element's HTML is split on `<br>` tags, and each piece is reduced to
 * its plain text and segmented with `segmentSentenceWithTrie` against
 * `window.wordTrie`. The pieces are then joined back with `<br>`. Does
 * nothing when the element is missing, has no text, or the trie is not
 * loaded.
 *
 * NOTE(review): because each piece is reduced to its text content, any
 * other inline markup already inside the element (for example `<mark>`
 * highlights) is dropped. Confirm this is acceptable for callers that
 * highlight search matches before calling this function.
 *
 * @param {HTMLElement} element - Element to process, e.g. the `<span>` or
 *   `<td>` holding an example sentence.
 */
function applySegmentationToElement(element) {
    if (!element || !element.textContent || !window.wordTrie) {
        return;
    }

    // Escape text for use both as element content and inside a
    // double-quoted attribute value.
    const escapeHtml = (value) => value
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');

    // Scratch element used only to turn an HTML fragment into plain text.
    const scratch = document.createElement('div');

    const renderedLines = element.innerHTML.split(/<br\s*\/?>/i).map((lineHtml) => {
        scratch.innerHTML = lineHtml;
        const plainText = scratch.textContent || scratch.innerText || '';
        if (!plainText) {
            return '';
        }

        return segmentSentenceWithTrie(plainText, window.wordTrie)
            .map((part) => {
                // Words are escaped as well as non-words: the plain text was
                // entity-decoded above, so it must be re-encoded before it
                // goes back into innerHTML.
                const safeText = escapeHtml(part.text);
                return part.isWord
                    ? `<a href="#" class="segmented-word" data-word="${safeText}">${safeText}</a>`
                    : safeText;
            })
            .join('');
    });

    // Put the <br> separators back between the lines.
    element.innerHTML = renderedLines.join('<br>');
}


function handleDomainMigration() {
if (window.location.hostname !== 'aiuanyu.github.io' && window.location.hostname !== 'fix-migration-dark-theme.hakspring.pages.dev' && window.location.hostname !== 'feat-dark-theme-loading-over.hakspring.pages.dev' && window.location.hostname !== 'fix-migration-page-scroll.hakspring.pages.dev') {
return false;
Expand Down Expand Up @@ -252,7 +359,8 @@ const DATA_FILES_TO_CACHE = [
// 其他資料
'tone_mapping.json',
'NAmedias.json',
'exclusions.json'
'exclusions.json',
'trie.json'
];

const DB_NAME = 'HakkaDataDB';
Expand Down Expand Up @@ -1274,7 +1382,8 @@ function getKeyNameFromPath(filePath) {
const otherMap = {
'tone_mapping': 'toneMappingData',
'NAmedias': 'missingAudioData',
'exclusions': '例外音檔'
'exclusions': '例外音檔',
'trie': 'wordTrie'
};
return otherMap[fileName];
}
Expand Down Expand Up @@ -2244,6 +2353,7 @@ function displayQueryResults(results, keyword, searchMode, summaryText, selected
const sentenceSpan = document.createElement('span');
sentenceSpan.className = 'sentence';
sentenceSpan.innerHTML = (highlight.sentence ? line['例句'].replace(highlightRegex, '<mark>$1</mark>') : line['例句']).replace(/\n/g, '<br>');
applySegmentationToElement(sentenceSpan); // 在這裡呼叫斷詞
td3.appendChild(sentenceSpan);
td3.appendChild(document.createElement('br'));

Expand Down Expand Up @@ -3348,6 +3458,7 @@ function renderCategoryItems(itemsToRender, dialectInfo, category, isInitialLoad
const sentenceSpan = document.createElement('span');
sentenceSpan.className = 'sentence';
sentenceSpan.innerHTML = line.例句.replace(/"/g, '').replace(/\n/g, '<br>');
applySegmentationToElement(sentenceSpan); // 在這裡呼叫斷詞
td3.appendChild(sentenceSpan);
td3.appendChild(document.createElement('br'));
if (dialectInfo.級名 === '高級' || (missingAudioInfo && missingAudioInfo.sentence === false)) {
Expand Down Expand Up @@ -4402,6 +4513,35 @@ function handleAutoPlay(autoPlayTargetRowId, dialectInfo, category) {
if (!isNaN(rowIndex) && g_currentSearchResults[rowIndex]) {
toggleSearchAccordion(button, g_currentSearchResults[rowIndex]);
}
return; // 處理完畢,返回
}

const segmentedWord = event.target.closest('a.segmented-word');
if (segmentedWord) {
event.preventDefault();
const word = segmentedWord.dataset.word;
if (word) {
const trElement = segmentedWord.closest('tr');
let contextualDialect = null;

if (trElement) {
if (trElement.classList.contains('accordion-row')) {
const dialectClass = Array.from(trElement.classList).find(c => ['四縣', '海陸', '大埔', '饒平', '詔安'].includes(c));
if (dialectClass) {
contextualDialect = dialectClass;
}
} else if (trElement.closest('#category-table')) {
if (g_currentDialectInfo && g_currentDialectInfo.腔名) {
contextualDialect = g_currentDialectInfo.腔名;
}
} else {
// Fallback for search results
contextualDialect = currentActiveMainDialectName;
}
}

showPronunciationPopup(word, null, segmentedWord, null, contextualDialect);
}
}
});
}
Expand Down Expand Up @@ -4573,6 +4713,7 @@ function createComparisonRow(line, dialectInfo) {
const sentenceSpan = document.createElement('span');
sentenceSpan.className = 'sentence';
sentenceSpan.innerHTML = line.例句.replace(/"/g, '').replace(/\n/g, '<br>');
applySegmentationToElement(sentenceSpan); // 在這裡呼叫斷詞
td3.appendChild(sentenceSpan);
td3.appendChild(document.createElement('br'));
if (dialectInfo.級名 === '高級' || (missingAudioInfo && missingAudioInfo.sentence === false)) {
Expand Down
Loading