-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclassify_papers.py
More file actions
143 lines (117 loc) · 5.14 KB
/
classify_papers.py
File metadata and controls
143 lines (117 loc) · 5.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python3
"""
Classify paper reviews as clean vs garbled based on suspicious character count.
"""
import os
import re
from collections import Counter
def is_garbled_text(text):
    """Heuristically decide whether *text* suffers from font-encoding garbling.

    Four independent heuristics are tried in order; the first that fires
    classifies the text as garbled:

      1. Ratio of encoding-artifact characters (e.g. 'h' -> '#').
      2. Known garbled character sequences ('t#e', '%orld', ...).
      3. Known 'm' -> 'e' substitution spellings ('systee', 'eobile', ...).
      4. Generic word-shape anomalies ('...ee' endings, 'ie' + consonant).

    Args:
        text: Review text to examine (a sample of the file is enough).

    Returns:
        True if the text looks garbled; False otherwise, including when
        the text is too short (< 100 chars) to judge reliably.
    """
    if not text or len(text) < 100:
        return False  # Too short to judge

    # Pattern 1: characters the broken font encoding substitutes for letters.
    # BUG FIX: the original set also contained 'm', but 'm' occurs in normal
    # English at roughly 2% of all characters -- well above the 1% threshold --
    # so virtually every clean review was misclassified as garbled.
    suspicious = set('#%+3615?248@>;=J')
    total_chars = len(text)
    suspicious_count = sum(1 for c in text if c in suspicious)
    suspicious_ratio = suspicious_count / total_chars
    # If more than 1% suspicious characters, likely garbled
    if suspicious_ratio > 0.01:
        return True

    # Pattern 2: literal sequences produced by the old font encoding.
    garbled_patterns = [
        r't#e', r'%orld', r'pop\+lation', r'a1in1', r'n\+m3er',
        r'e5perien6in1', r'f\+n6tional', r'6apa3ility', r'desi1n',
        r'prod\+6ts', r's\+66ess', r'\+\s*sers', r'6orre6t',
        r'lsk\d+Jeng', r'pjc\d+Jeng',  # garbled email patterns
    ]
    for pattern in garbled_patterns:
        if re.search(pattern, text):
            return True

    # Pattern 3: 'm' -> 'e' substitution spellings.
    # BUG FIX: the original list mixed ~19 ordinary English words (such as
    # 'developers', 'library', 'features') in with the garbled forms -- and
    # listed 'ieproveeents' twice -- so a clean review that merely used 16+
    # of those common words was flagged garbled.  Only genuinely garbled
    # spellings remain, and the threshold is scaled to keep roughly the
    # original hit-rate requirement (16 of 39 ~= 41% -> 9 of 20).
    substitution_patterns = [
        r'systee\b', r'inforeation\b', r'eobile\b', r'coeputer\b',
        r'ieproveeents\b', r'eappings\b', r'coepleted\b', r'eode\b',
        r'graeear\b', r'optieization\b', r'prograeeing\b', r'subsystee\b',
        r'atteept\b', r'lieitations\b', r'eeeory\b', r'prelieinary\b',
        r'eeploy\b', r'sieple\b', r'fore\b', r'eepirical\b',
    ]
    substitution_count = sum(
        1 for pattern in substitution_patterns
        if re.search(pattern, text, re.IGNORECASE)
    )
    if substitution_count > 8:
        return True

    # Pattern 4: generic word shapes typical of the 'e' substitutions --
    # words ending in 'ee' (often a mangled '...em') and 'ie' followed by
    # a consonant.  A few false positives are expected; the high threshold
    # (>30) keeps clean text safe.
    words = re.findall(r'\b\w{5,}\b', text.lower())
    garbled_word_count = 0
    for word in words:
        if re.search(r'ee\b', word):
            garbled_word_count += 1
        if re.search(r'ie[^aeiou]', word):
            garbled_word_count += 1
    if garbled_word_count > 30:
        return True

    return False
def analyze_reviews(reviews_dir):
    """Classify every '*_review.txt' file under *reviews_dir*.

    Each file's first 5000 characters are sampled and handed to
    is_garbled_text(); files that cannot be read at all are treated as
    garbled.

    Returns:
        (clean_stems, garbled_stems, total) where the stem lists are
        sorted file names with the '_review.txt' suffix stripped.
    """
    import glob

    clean, garbled = [], []
    review_paths = glob.glob(os.path.join(reviews_dir, '*_review.txt'))
    for review_path in review_paths:
        stem = os.path.basename(review_path).replace('_review.txt', '')
        try:
            # A 5000-character sample is plenty for the heuristics.
            with open(review_path, 'r', encoding='utf-8', errors='ignore') as handle:
                sample = handle.read(5000)
            bucket = garbled if is_garbled_text(sample) else clean
            bucket.append(stem)
        except Exception as e:
            print(f"Error reading {review_path}: {e}")
            garbled.append(stem)
    return sorted(clean), sorted(garbled), len(review_paths)
def main():
    """Classify all reviews in ./paper_reviews and write the two result lists.

    Writes clean_papers.txt and garbled_papers.txt into the current
    directory and prints a summary plus the first ten entries of each group.
    """
    reviews_dir = 'paper_reviews'
    clean, garbled, total = analyze_reviews(reviews_dir)

    print(f"Total papers analyzed: {total}")
    # BUG FIX: guard against ZeroDivisionError when the reviews directory
    # is missing or contains no '*_review.txt' files.
    if total == 0:
        print("No review files found; nothing to classify.")
        return
    print(f"Clean papers: {len(clean)} ({len(clean)/total*100:.1f}%)")
    print(f"Garbled papers: {len(garbled)} ({len(garbled)/total*100:.1f}%)")

    # Persist both classifications for downstream scripts.
    with open('clean_papers.txt', 'w', encoding='utf-8') as f:
        f.write("# Clean Papers (readable text)\n")
        f.write("# Generated by classify_papers.py\n\n")
        for paper in clean:
            f.write(f"{paper}\n")
    with open('garbled_papers.txt', 'w', encoding='utf-8') as f:
        f.write("# Garbled Papers (font encoding issues)\n")
        f.write("# Generated by classify_papers.py\n\n")
        for paper in garbled:
            f.write(f"{paper}\n")

    print("\nClean papers list saved to clean_papers.txt")
    print("Garbled papers list saved to garbled_papers.txt")

    # Show a few examples from each group.
    print("\nFirst 10 clean papers:")
    for i, paper in enumerate(clean[:10], start=1):
        print(f" {i}. {paper}")
    print("\nFirst 10 garbled papers:")
    for i, paper in enumerate(garbled[:10], start=1):
        print(f" {i}. {paper}")


if __name__ == '__main__':
    main()