-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclassify_papers.py
More file actions
143 lines (117 loc) · 5.14 KB
/
classify_papers.py
File metadata and controls
143 lines (117 loc) · 5.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python3
"""
Classify paper reviews as clean vs garbled based on suspicious character count.
"""
import os
import re
from collections import Counter
def is_garbled_text(text):
    """Heuristically decide whether *text* suffers from font-encoding garbling.

    Four independent heuristics are tried in order; the first that fires
    classifies the text as garbled:

      1. Ratio of encoding-artifact characters (e.g. 'h' -> '#').
      2. Known garbled character sequences ('t#e', '%orld', ...).
      3. Known 'm' -> 'e' substitution spellings ('systee', 'eobile', ...).
      4. Generic word-shape anomalies ('...ee' endings, 'ie' + consonant).

    Args:
        text: Review text to examine (a sample of the file is enough).

    Returns:
        True if the text looks garbled; False otherwise, including when
        the text is too short (< 100 chars) to judge reliably.
    """
    if not text or len(text) < 100:
        return False  # Too short to judge

    # Pattern 1: characters the broken font encoding substitutes for letters.
    # BUG FIX: the original set also contained 'm', but 'm' occurs in normal
    # English at roughly 2% of all characters -- well above the 1% threshold --
    # so virtually every clean review was misclassified as garbled.
    suspicious = set('#%+3615?248@>;=J')
    total_chars = len(text)
    suspicious_count = sum(1 for c in text if c in suspicious)
    suspicious_ratio = suspicious_count / total_chars
    # If more than 1% suspicious characters, likely garbled
    if suspicious_ratio > 0.01:
        return True

    # Pattern 2: literal sequences produced by the old font encoding.
    garbled_patterns = [
        r't#e', r'%orld', r'pop\+lation', r'a1in1', r'n\+m3er',
        r'e5perien6in1', r'f\+n6tional', r'6apa3ility', r'desi1n',
        r'prod\+6ts', r's\+66ess', r'\+\s*sers', r'6orre6t',
        r'lsk\d+Jeng', r'pjc\d+Jeng',  # garbled email patterns
    ]
    for pattern in garbled_patterns:
        if re.search(pattern, text):
            return True

    # Pattern 3: 'm' -> 'e' substitution spellings.
    # BUG FIX: the original list mixed ~19 ordinary English words (such as
    # 'developers', 'library', 'features') in with the garbled forms -- and
    # listed 'ieproveeents' twice -- so a clean review that merely used 16+
    # of those common words was flagged garbled.  Only genuinely garbled
    # spellings remain, and the threshold is scaled to keep roughly the
    # original hit-rate requirement (16 of 39 ~= 41% -> 9 of 20).
    substitution_patterns = [
        r'systee\b', r'inforeation\b', r'eobile\b', r'coeputer\b',
        r'ieproveeents\b', r'eappings\b', r'coepleted\b', r'eode\b',
        r'graeear\b', r'optieization\b', r'prograeeing\b', r'subsystee\b',
        r'atteept\b', r'lieitations\b', r'eeeory\b', r'prelieinary\b',
        r'eeploy\b', r'sieple\b', r'fore\b', r'eepirical\b',
    ]
    substitution_count = sum(
        1 for pattern in substitution_patterns
        if re.search(pattern, text, re.IGNORECASE)
    )
    if substitution_count > 8:
        return True

    # Pattern 4: generic word shapes typical of the 'e' substitutions --
    # words ending in 'ee' (often a mangled '...em') and 'ie' followed by
    # a consonant.  A few false positives are expected; the high threshold
    # (>30) keeps clean text safe.
    words = re.findall(r'\b\w{5,}\b', text.lower())
    garbled_word_count = 0
    for word in words:
        if re.search(r'ee\b', word):
            garbled_word_count += 1
        if re.search(r'ie[^aeiou]', word):
            garbled_word_count += 1
    if garbled_word_count > 30:
        return True

    return False
def analyze_reviews(reviews_dir):
    """Classify every '*_review.txt' file under *reviews_dir*.

    Each file's first 5000 characters are sampled and handed to
    is_garbled_text(); files that cannot be read at all are treated as
    garbled.

    Returns:
        (clean_stems, garbled_stems, total) where the stem lists are
        sorted file names with the '_review.txt' suffix stripped.
    """
    import glob

    clean, garbled = [], []
    review_paths = glob.glob(os.path.join(reviews_dir, '*_review.txt'))
    for review_path in review_paths:
        stem = os.path.basename(review_path).replace('_review.txt', '')
        try:
            # A 5000-character sample is plenty for the heuristics.
            with open(review_path, 'r', encoding='utf-8', errors='ignore') as handle:
                sample = handle.read(5000)
            bucket = garbled if is_garbled_text(sample) else clean
            bucket.append(stem)
        except Exception as e:
            print(f"Error reading {review_path}: {e}")
            garbled.append(stem)
    return sorted(clean), sorted(garbled), len(review_paths)
def main():
    """Classify all reviews in ./paper_reviews and write the two result lists.

    Writes clean_papers.txt and garbled_papers.txt into the current
    directory and prints a summary plus the first ten entries of each group.
    """
    reviews_dir = 'paper_reviews'
    clean, garbled, total = analyze_reviews(reviews_dir)

    print(f"Total papers analyzed: {total}")
    # BUG FIX: guard against ZeroDivisionError when the reviews directory
    # is missing or contains no '*_review.txt' files.
    if total == 0:
        print("No review files found; nothing to classify.")
        return
    print(f"Clean papers: {len(clean)} ({len(clean)/total*100:.1f}%)")
    print(f"Garbled papers: {len(garbled)} ({len(garbled)/total*100:.1f}%)")

    # Persist both classifications for downstream scripts.
    with open('clean_papers.txt', 'w', encoding='utf-8') as f:
        f.write("# Clean Papers (readable text)\n")
        f.write("# Generated by classify_papers.py\n\n")
        for paper in clean:
            f.write(f"{paper}\n")
    with open('garbled_papers.txt', 'w', encoding='utf-8') as f:
        f.write("# Garbled Papers (font encoding issues)\n")
        f.write("# Generated by classify_papers.py\n\n")
        for paper in garbled:
            f.write(f"{paper}\n")

    print("\nClean papers list saved to clean_papers.txt")
    print("Garbled papers list saved to garbled_papers.txt")

    # Show a few examples from each group.
    print("\nFirst 10 clean papers:")
    for i, paper in enumerate(clean[:10], start=1):
        print(f" {i}. {paper}")
    print("\nFirst 10 garbled papers:")
    for i, paper in enumerate(garbled[:10], start=1):
        print(f" {i}. {paper}")


if __name__ == '__main__':
    main()