NewsDraft/document_processor.py at ToDeploy · lCaptNemol/NewsDraft · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import io
import docx
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Pt, Inches

def _word_heading_level(style_name):
    """Return the heading level encoded in a Word style name such as "Heading 2".

    The level is read from the last character of the name; if that character
    is not a digit (or the name is empty) the level defaults to 1.
    """
    try:
        return int(style_name[-1])
    except (IndexError, ValueError):
        return 1


def _markdown_heading_level(text):
    """Return the Markdown heading level of *text*, or 0 if it is not a heading.

    A Markdown heading is one to six leading ``#`` characters immediately
    followed by a space (for example ``"## Title"``). *text* is expected to be
    already stripped of surrounding whitespace.
    """
    hashes = len(text) - len(text.lstrip('#'))
    if 1 <= hashes <= 6 and text[hashes:hashes + 1] == ' ':
        return hashes
    return 0


def extract_content_from_docx(file_path):
    """
    Extract content from a Word document organized by headings.
    This function can handle both:
    1. Standard Word heading styles (Heading 1, Heading 2, etc.)
    2. Markdown-style headings (# Heading, ## Subheading, etc.)

    Text that appears before the first heading is returned under an implicit
    level-1 section titled "Untitled". That implicit section is omitted when
    it has no text, so a document that starts with a heading (or is empty)
    does not yield a spurious empty "Untitled" entry. Sections introduced by a
    real heading are always returned, even when they have no content.

    Args:
        file_path (str): Path to the Word document

    Returns:
        list: A list of dictionaries with the keys "level" (int), "heading"
              (str) and "content" (str, non-empty paragraphs joined by
              newlines). An empty list is returned if the document cannot be
              read; the error is printed rather than raised.
    """
    try:
        doc = docx.Document(file_path)

        # Each entry pairs a section dict with the list of paragraph texts
        # collected for it. Index 0 is always the implicit preamble section.
        preamble = {"level": 1, "heading": "Untitled", "content": ""}
        collected = [(preamble, [])]

        for para in doc.paragraphs:
            text = para.text.strip()
            style_name = para.style.name if para.style else None

            new_section = None
            if style_name and style_name.startswith('Heading'):
                # Word heading style: the paragraph text is the heading itself.
                new_section = {
                    "level": _word_heading_level(style_name),
                    "heading": text,
                    "content": "",
                }
            else:
                md_level = _markdown_heading_level(text)
                if md_level:
                    # Markdown heading: drop the leading "#" markers.
                    new_section = {
                        "level": md_level,
                        "heading": text[md_level:].strip(),
                        "content": "",
                    }

            if new_section is not None:
                collected.append((new_section, []))
            elif text:
                # Regular (non-empty) paragraph, including text that merely
                # starts with "#" without being a valid heading.
                collected[-1][1].append(text)

        document_structure = []
        for index, (section, lines) in enumerate(collected):
            if index == 0 and not lines:
                # Nothing preceded the first heading: skip the placeholder.
                continue
            section["content"] = "\n".join(lines)
            document_structure.append(section)

        return document_structure

    except Exception as e:
        # Best-effort boundary: report the problem and signal failure with [].
        print(f"Error extracting content from docx: {e}")
        return []


def create_newsletter_docx(document_structure, *, title="Internal Newsletter"):
    """
    Create a newsletter Word document from processed content.

    Args:
        document_structure (list): A list of dictionaries containing heading level
                                  ("level"), heading text ("heading") and the
                                  section text. The text is taken from
                                  "processed_content" when that key is present
                                  and not None, otherwise from "content"; a
                                  section with neither gets a heading only.
        title (str): Title placed at the top of the document. Defaults to
                     "Internal Newsletter".

    Returns:
        bytes: The Word document as bytes
    """
    doc = docx.Document()

    # Set one-inch page margins on every section of the document
    for section in doc.sections:
        section.top_margin = Inches(1)
        section.bottom_margin = Inches(1)
        section.left_margin = Inches(1)
        section.right_margin = Inches(1)

    # Add a centered title
    title_heading = doc.add_heading(title, 0)
    title_heading.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

    # Add a horizontal line (a bold run of underscores)
    doc.add_paragraph().add_run("_" * 50).bold = True

    # Process each section
    for item in document_structure:
        # Prefer processed content; fall back lazily so an item that only
        # carries "processed_content" does not fail on a missing "content" key,
        # and a None value does not break the split below.
        content = item.get("processed_content")
        if content is None:
            content = item.get("content") or ""

        # Add the heading
        doc.add_heading(item["heading"], item["level"])

        # Add one paragraph per non-blank line of content
        for para_text in content.split('\n'):
            if para_text.strip():
                doc.add_paragraph().add_run(para_text)

        # Add some space after each section
        doc.add_paragraph()

    # Serialize the document in memory; getvalue() returns the whole buffer
    # regardless of the current stream position, so no seek is needed.
    with io.BytesIO() as buffer:
        doc.save(buffer)
        return buffer.getvalue()