-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocument_processor.py
More file actions
148 lines (122 loc) · 5.34 KB
/
document_processor.py
File metadata and controls
148 lines (122 loc) · 5.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import io
import docx
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Pt, Inches
def extract_content_from_docx(file_path):
    """
    Extract content from a Word document, grouped into sections by heading.

    Two kinds of headings are recognised:
    1. Paragraphs whose Word style name starts with "Heading" (Heading 1, Heading 2, etc.)
    2. Markdown-style headings: one to six '#' characters followed by a space
       (# Heading, ## Subheading, etc.)

    Text that appears before the first heading is collected under a default
    level-1 section named "Untitled". That section is always emitted first,
    even when it has no content. Empty paragraphs are skipped.

    Args:
        file_path (str): Path to the Word document

    Returns:
        list: A list of dictionaries, in document order, each with the keys
            "level" (int), "heading" (str) and "content" (str, the section's
            paragraphs joined with newlines). If anything goes wrong the error
            is printed and an empty list is returned.
    """
    try:
        doc = docx.Document(file_path)
        document_structure = []
        current_heading = {"level": 1, "heading": "Untitled", "content": ""}
        current_content = []

        for para in doc.paragraphs:
            text = para.text.strip()
            style_name = para.style.name if para.style else None

            # A Word heading style wins over markdown detection, so a paragraph
            # styled "Heading 2" whose text starts with '#' keeps the '#' in its title.
            if style_name and style_name.startswith('Heading'):
                new_section = (_heading_level_from_style_name(style_name), text)
            else:
                new_section = _parse_markdown_heading(text)

            if new_section is None:
                # Regular body text (including lines like "#hashtag" that are
                # not valid markdown headings); skip empty paragraphs.
                if text:
                    current_content.append(text)
                continue

            # Close the previous section regardless of whether it has content,
            # then start collecting for the new one.
            current_heading["content"] = "\n".join(current_content)
            document_structure.append(current_heading)
            level, heading_text = new_section
            current_heading = {"level": level, "heading": heading_text, "content": ""}
            current_content = []

        # Add the last section regardless of content
        current_heading["content"] = "\n".join(current_content)
        document_structure.append(current_heading)
        return document_structure
    except Exception as e:
        # Deliberate best-effort boundary: callers receive [] instead of an exception.
        print(f"Error extracting content from docx: {str(e)}")
        return []


def _heading_level_from_style_name(style_name):
    """Return the heading level (1-9) encoded as trailing digits of a style name, else 1."""
    # Take the whole trailing digit run rather than only the last character, so
    # a name such as "Heading 10" is not misread as level 0.
    trailing_digits = style_name[len(style_name.rstrip("0123456789")):]
    if not trailing_digits:
        return 1
    # Clamp to 1-9, the range of Word's built-in heading styles.
    return max(1, min(int(trailing_digits), 9))


def _parse_markdown_heading(text):
    """Return (level, heading_text) if text is a '#'-style heading, otherwise None."""
    level = len(text) - len(text.lstrip('#'))
    # Valid only with 1-6 leading '#' followed by a space and some heading text.
    if 1 <= level <= 6 and text[level:level + 1] == ' ':
        return level, text[level:].strip()
    return None
def create_newsletter_docx(document_structure):
    """
    Create a newsletter Word document from processed content.

    The generated document has 1-inch margins on every section, a centered
    "Internal Newsletter" title and a bold underscore rule. Each item then
    contributes its heading, one paragraph per non-blank line of its content,
    and an empty spacer paragraph.

    Args:
        document_structure (list): A list of dictionaries containing "level"
            and "heading", plus "processed_content" and/or "content". The
            processed content is used when present and not None; otherwise the
            raw content is used. An item with neither yields a heading only.

    Returns:
        bytes: The Word document as bytes

    Raises:
        KeyError: If an item has no "level" or "heading" key.
    """
    doc = docx.Document()

    # Set page margins
    for section in doc.sections:
        section.top_margin = Inches(1)
        section.bottom_margin = Inches(1)
        section.left_margin = Inches(1)
        section.right_margin = Inches(1)

    # Add a title
    title = doc.add_heading("Internal Newsletter", 0)
    title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

    # Add a horizontal line
    doc.add_paragraph().add_run("_" * 50).bold = True

    # Process each section
    for item in document_structure:
        heading_level = item["level"]
        heading_text = item["heading"]

        # Prefer processed content. Look up "content" only when it is needed,
        # so an item carrying just "processed_content" does not raise KeyError.
        content = item.get("processed_content")
        if content is None:
            content = item.get("content")
        if content is None:
            content = ""

        # Add the heading
        doc.add_heading(heading_text, heading_level)

        # Add the content, one paragraph per non-blank line
        for para_text in content.split('\n'):
            if para_text.strip():
                doc.add_paragraph().add_run(para_text)

        # Add some space after each section
        doc.add_paragraph()

    # Save the document to a bytes buffer; getvalue() returns the whole buffer
    # regardless of the current stream position.
    buffer = io.BytesIO()
    doc.save(buffer)
    return buffer.getvalue()