-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathadd_bibtex.py
More file actions
176 lines (140 loc) · 6.08 KB
/
add_bibtex.py
File metadata and controls
176 lines (140 loc) · 6.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#!/usr/bin/env python3
"""
Script to add BibTeX to publications in index.html from publications_complete.json
"""
import json
import re
from pathlib import Path
from html import escape
# Absolute locations of the publication database and of the page to patch.
json_path = Path("/Users/kayhan/Documents/Projects/newWebSite/publications_complete.json")
html_path = Path("/Users/kayhan/Documents/Projects/newWebSite/index.html")

# Parse the publication database; the lookup code below indexes it by
# year (as a string) and expects a list of publication dicts per year.
publications_data = json.loads(json_path.read_text(encoding='utf-8'))

# Pull the whole page into memory; every later edit is done on this string.
html_content = html_path.read_text(encoding='utf-8')
def normalize_title(title):
    """Return a canonical form of a publication title for comparison.

    The title is stripped of HTML tags, HTML character references
    (``&amp;``, ``&lt;``, ``&#8211;`` ...) are decoded, runs of whitespace
    are collapsed to a single space, and the result is lower-cased.

    Args:
        title: Raw title text, possibly containing HTML markup. ``None`` or
            an empty string is accepted.

    Returns:
        The normalized title, or ``''`` when *title* is empty or ``None``.
    """
    # Local import: the file's top-level import only brings in `escape`.
    from html import unescape

    if not title:
        return ''

    # Drop tags first so that an escaped "&lt;b&gt;" is kept as literal text
    # instead of being decoded and then mistaken for a real tag.
    text = re.sub(r'<[^>]+>', '', title)
    # Decode character references so "A &amp; B" and "A & B" compare equal.
    text = unescape(text)
    text = re.sub(r'\s+', ' ', text.strip())
    return text.lower()
def find_publication_in_json(title, year):
    """Look up the JSON record that corresponds to an HTML publication title.

    Only the publications filed under *year* are considered. An exact match
    on the normalized title wins; otherwise the entry with the highest
    word-overlap ratio is chosen, provided that ratio exceeds 0.6.

    Args:
        title: Title text taken from the HTML page.
        year: Publication year; converted with ``str()`` to form the key
            into ``publications_data``.

    Returns:
        The matching publication dict, or ``None`` when the year is unknown,
        when nothing matches, or when the title has fewer than three distinct
        words (too short for a meaningful fuzzy comparison).
    """
    year_key = str(year)
    if year_key not in publications_data:
        return None
    candidates = publications_data[year_key]

    wanted = normalize_title(title)

    # Pass 1: the first entry whose normalized title is identical.
    exact = next(
        (entry for entry in candidates
         if normalize_title(entry.get('title', '')) == wanted),
        None,
    )
    if exact is not None:
        return exact

    # Pass 2: word-set overlap, measured against the shorter of the two titles.
    wanted_words = set(wanted.split())
    if len(wanted_words) < 3:
        return None

    winner = None
    top_ratio = 0
    for entry in candidates:
        entry_words = set(normalize_title(entry.get('title', '')).split())
        if not entry_words:
            continue
        shared = len(wanted_words & entry_words)
        ratio = shared / min(len(wanted_words), len(entry_words))
        # Strict comparisons: ties keep the earlier entry, and 0.6 itself
        # is not good enough.
        if ratio > 0.6 and ratio > top_ratio:
            winner, top_ratio = entry, ratio
    return winner
def format_bibtex_html(bibtex_text):
    """Render a BibTeX entry as a collapsible HTML snippet.

    The snippet consists of a "Show BibTeX" button followed by a hidden
    ``<pre>`` element; the button calls the page-level ``toggleBibtex(id)``
    JavaScript function with the ``<pre>`` element's id.

    Args:
        bibtex_text: BibTeX source. Literal backslash-n sequences are turned
            into real line breaks before rendering.

    Returns:
        An HTML string whose ``<pre>`` id is unique per call.
    """
    # Local import keeps this block self-contained (uuid is not imported at
    # the top of the file).
    import uuid

    # Replace literal "\n" with actual newlines.
    # NOTE(review): this also rewrites LaTeX commands that start with "\n"
    # (e.g. "\nu", "\nabla") if they occur in an entry - confirm the JSON
    # never contains such commands.
    bibtex_text = bibtex_text.replace('\\n', '\n')

    # Escape HTML special characters so the entry is shown verbatim.
    bibtex_escaped = escape(bibtex_text)

    # A five-digit random number collides with noticeable probability over a
    # full publication list (and across re-runs of the script), which makes
    # getElementById() toggle the wrong entry. A UUID avoids that.
    bibtex_id = f"bibtex_{uuid.uuid4().hex}"

    return (
        '<div class="bibtex-section mt-2">\n'
        f'''<button class="btn btn-sm btn-outline-secondary mb-2" type="button" onclick="toggleBibtex('{bibtex_id}')">\n'''
        'Show BibTeX\n'
        '</button>\n'
        f'<pre id="{bibtex_id}" style="display: none; background: var(--bg-soft); padding: 12px; border-radius: 8px; border: 1px solid var(--card-border); font-size: 0.85rem; overflow-x: auto; white-space: pre-wrap; word-wrap: break-word;"><code>{bibtex_escaped}</code></pre>\n'
        '</div>'
    )
# Find all year sections: each publication year is introduced by an <h4>
# heading whose text is a four-digit year (captured as group 1).
year_sections = list(re.finditer(r'<h4 class="mt-4 mb-3 fw-bold">(\d{4})</h4>', html_content))
if not year_sections:
    print("No year sections found!")
    # `exit()` is a helper injected by the `site` module for interactive use
    # and is not guaranteed to exist in every run mode; raising SystemExit
    # is the equivalent that always works.
    raise SystemExit(1)

# Working copy of the page; all insertions are applied to this string.
result_content = html_content
# Patterns used while patching articles. Each <article> is matched on its own
# so that an article lacking a title or a meta-links block can never cause a
# match to run on into the following article.
_ARTICLE_RE = re.compile(r'<article class="item-row">.*?</article>', re.DOTALL)
_TITLE_RE = re.compile(r'<h5 class="mb-1 fw-bold">(.*?)</h5>', re.DOTALL)
_META_LINKS_RE = re.compile(r'<div class="meta-links mb-2">.*?</div>', re.DOTALL)


def _article_with_bibtex(article_html, year):
    """Return one ``<article>`` element with a BibTeX block inserted.

    The block is placed directly after the article's meta-links ``<div>``.
    The article is returned unchanged when it has no title heading, has no
    meta-links block after the title, already contains a BibTeX block after
    the meta-links, or has no matching JSON record with a ``bibtex`` value.

    Args:
        article_html: Full text of a single ``<article>...</article>``.
        year: Year of the section the article belongs to.

    Returns:
        The (possibly extended) article HTML.
    """
    title_match = _TITLE_RE.search(article_html)
    if not title_match:
        return article_html

    # The meta-links block must follow the title, as in the page layout.
    meta_match = _META_LINKS_RE.search(article_html, title_match.end())
    if not meta_match:
        return article_html
    insert_at = meta_match.end()

    # Already processed on an earlier run: keep the script idempotent.
    if 'bibtex-section' in article_html[insert_at:]:
        return article_html

    title_clean = re.sub(r'<[^>]+>', '', title_match.group(1)).strip()
    pub_data = find_publication_in_json(title_clean, year)
    if not pub_data or not pub_data.get('bibtex'):
        return article_html

    bibtex_html = format_bibtex_html(pub_data['bibtex'])
    return article_html[:insert_at] + '\n ' + bibtex_html + article_html[insert_at:]


# Process from end to start: the heading offsets were computed on the
# original page, and editing a later section never moves an earlier offset.
for i in reversed(range(len(year_sections))):
    year_match = year_sections[i]
    year = year_match.group(1)
    year_start = year_match.start()

    # A year's section runs up to the next year heading; the last one runs up
    # to the closing </section> tag (or the end of the page if there is none).
    if i + 1 < len(year_sections):
        section_end = year_sections[i + 1].start()
    else:
        closing_at = result_content.find('</section>', year_start)
        section_end = closing_at if closing_at != -1 else len(result_content)

    year_section = result_content[year_start:section_end]

    # A function replacement is used, so backslashes in the BibTeX text are
    # inserted literally rather than being read as regex escapes.
    updated_section = _ARTICLE_RE.sub(
        lambda m, y=year: _article_with_bibtex(m.group(0), y),
        year_section,
    )

    # Splice the patched section back into the page.
    result_content = result_content[:year_start] + updated_section + result_content[section_end:]
# Add JavaScript function for toggling BibTeX if not present.
# The function shows/hides the <pre> with the given id and updates the label
# of the button that immediately precedes it.
toggle_script = '''
<script>
function toggleBibtex(id) {
const element = document.getElementById(id);
const button = element.previousElementSibling;
if (element.style.display === 'none') {
element.style.display = 'block';
button.textContent = 'Hide BibTeX';
} else {
element.style.display = 'none';
button.textContent = 'Show BibTeX';
}
}
</script>
'''
if 'function toggleBibtex' not in result_content:
    # Insert before the LAST </body> only; a plain str.replace() would inject
    # the script at every occurrence of the tag text.
    page_head, closing_body, page_tail = result_content.rpartition('</body>')
    if closing_body:
        result_content = page_head + toggle_script + closing_body + page_tail
    else:
        # Without the script the inserted buttons do nothing, so say so
        # instead of skipping silently.
        print("Warning: no </body> tag found; toggleBibtex script was not added.")
# Persist the patched page, overwriting the file it was read from.
html_path.write_text(result_content, encoding='utf-8')

# Tell the user where the result went and that it needs a manual check.
print(f"Added BibTeX to publications in {html_path}")
print("Please review the changes.")