Initial commit

3 months ago · c3dae45f86
5 changed files with 315 additions and 0 deletions
--- a/Limes.pdf
+++ b/Limes.pdf
--- a/Limes.pdf
+++ b/Limes.pdf
--- a/extract_pdf.py
+++ b/extract_pdf.py
@ -0,0 +1,240 @@
+import fitz
+import base64
+import os
+import sys
+import re
+
+def extract_pdf_to_html(pdf_path, output_path):
+    if not os.path.exists(pdf_path):
+        print(f"Error: File {pdf_path} not found.")
+        return
+
+    doc = fitz.open(pdf_path)
+
+    html_content = """<!DOCTYPE html>
+<html lang="it">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Articolo Limes</title>
+    <style>
+        body {
+            font-family: 'Georgia', serif;
+            line-height: 1.6;
+            color: #333;
+            max-width: 800px;
+            margin: 0 auto;
+            padding: 20px;
+            background-color: #f4f4f4;
+        }
+        .article-container {
+            background-color: #fff;
+            padding: 40px;
+            box-shadow: 0 0 10px rgba(0,0,0,0.1);
+        }
+        h1 {
+            color: #b30000;
+            font-size: 2.2em;
+            margin-bottom: 10px;
+            line-height: 1.2;
+            text-transform: uppercase;
+        }
+        .subtitle {
+            font-size: 1.2em;
+            color: #555;
+            margin-bottom: 20px;
+            font-style: italic;
+        }
+        p {
+            margin-bottom: 15px;
+            text-align: justify;
+        }
+        img {
+            max-width: 100%;
+            height: auto;
+            display: block;
+            margin: 20px auto;
+        }
+        .caption {
+            font-size: 0.9em;
+            color: #666;
+            text-align: center;
+            margin-top: -10px;
+            margin-bottom: 20px;
+            font-style: italic;
+        }
+        strong {
+            color: #b30000;
+        }
+    </style>
+</head>
+<body>
+    <div class="article-container">
+"""
+
+    exclude_keywords = [
+        "MENU", "CERCA", "CHI SIAMO", "LA SCUOLA DI LIMES", "VOLUMI", "LEGGI PDF",
+        "I NOSTRI TEMI", "CARTE", "VIDEO", "PODCAST", "EVENTI", "MATBEN88",
+        "IN EDICOLA", "ABBONATI", "Privacy", "Cookie Policy", "Gestione cookie",
+        "Riserva TDM", "Dichiarazione di accessibilità", "GEDI Periodici",
+        "Tutti a lezione di pace", "Il mito dell'Urbe", "IL PUNTO", "CARTA INEDITA",
+        "MENU CERCA", "RUBRICHE", "IL MONDO OGGI", "EBOOK", "AUTORI", "ARGOMENTI", "PROTAGONISTI", "FAQ"
+    ]
+
+    def clean_text(text):
+        # Fix ligatures and other common issues
+        replacements = {
+            "": "fi", # specific to the issue seen, might be fl or fi. "conitto" -> conflitto (fl)
+            "ﬁ": "fi",
+            "ﬂ": "fl",
+            "’": "'",
+            "“": '"',
+            "”": '"',
+        }
+        # Check for the specific char seen in output
+        # In the output it was "conitto". 
+        # If I can't paste the char easily, I might need to handle it by context or unicode.
+        # The char in the output looked like a box or special char.
+        # Let's assume it's a mapping issue.
+        # "conitto" -> "conflitto". So  is fl.
+        
+        for k, v in replacements.items():
+            text = text.replace(k, v)
+        
+        # Generic ligature fix if they appear as special chars
+        # But without knowing the exact unicode, it's hard.
+        # Let's try to fix "con itto" if it appears with a space or weird char.
+        text = text.replace("conitto", "conflitto")
+        
+        return text
+
+    title_found = False
+    
+    # Buffer for text to handle hyphenation across blocks/pages if needed (though blocks are usually self-contained paragraphs)
+    # Actually, let's just process blocks.
+
+    for page_num in range(len(doc)):
+        page = doc[page_num]
+        page_dict = page.get_text("dict")
+
+        # Sort blocks by vertical position
+        blocks = sorted(page_dict["blocks"], key=lambda b: b["bbox"][1])
+
+        for block in blocks:
+            bbox = block["bbox"]
+
+            # 1. Header/Footer filtering
+            if bbox[1] < 30 or bbox[3] > 815:
+                continue
+
+            # 2. Sidebar filtering (sidebar is on the right)
+            if bbox[0] > 430:
+                continue
+
+            if block["type"] == 0:  # Text block
+                # ... (text extraction logic)
+                text = ""
+                lines_text = []
+                for line in block["lines"]:
+                    line_str = "".join([span["text"] for span in line["spans"]])
+                    lines_text.append(line_str)
+                
+                full_text = ""
+                for i, line in enumerate(lines_text):
+                    if line.endswith("-") and i < len(lines_text) - 1:
+                        full_text += line[:-1]
+                    else:
+                        full_text += line + " "
+                
+                text = full_text.strip()
+                text = clean_text(text)
+
+                if not text:
+                    continue
+                
+                # Debug print
+                print(f"Processing block: {text[:30]}... bbox={bbox}")
+
+                # 3. Keyword filtering
+                # Check if any keyword is in the text
+                # We use a more aggressive filter for uppercase or short text
+                if any(kw in text for kw in exclude_keywords):
+                    if text.isupper() or len(text) < 100 or "Privacy" in text or "Cookie" in text:
+                        print(f"Excluded junk: {text[:30]}...")
+                        continue
+
+                # 4. Title detection and cleaning
+                if not title_found and "Home" in text:
+                    # Capture the full uppercase title
+                    # Adjusted regex to be a bit more flexible
+                    match = re.search(r"Home(?:[\w\s]+?)([A-Z\s,:'\"!?.—–-]{10,})", text)
+                    if match:
+                        title = match.group(1).strip()
+                        remaining = text[match.end():]
+
+                        # Fix for title grabbing the first letter of subtitle
+                        if title and remaining and title[-1].isupper() and remaining[0].islower():
+                             # Check if the last word of title is just 1 char
+                             last_word = title.split()[-1]
+                             # If it's a single letter and not a common vowel word (A, I, O, E, U, Y), assume it belongs to subtitle
+                             if len(last_word) == 1 and last_word not in ["A", "I", "E", "O", "U"]:
+                                 char_to_move = title[-1]
+                                 title = title[:-1].strip()
+                                 remaining = char_to_move + remaining
+
+                        html_content += f"        <h1>{title}</h1>\n"
+
+                        # Try to extract subtitle
+                        subtitle_match = re.search(r"^([^.!?]+[.!?])", remaining)
+                        if subtitle_match:
+                            subtitle = subtitle_match.group(1).strip()
+                            html_content += f"        <div class='subtitle'>{subtitle}</div>\n"
+
+                        title_found = True
+                        continue
+                    else:
+                         # Fallback if regex fails but "Home" is there, maybe the title is just the rest
+                         # Or maybe it's not the title block.
+                         pass
+
+                # Formatting
+                if len(text) < 150 and text.isupper() and not title_found and len(text) > 10:
+                    html_content += f"        <h1>{text}</h1>\n"
+                    title_found = True
+                    print(f"Added H1 (heuristic): {text[:30]}")
+                elif len(text) < 150 and text.isupper():
+                    html_content += f"        <p><strong>{text}</strong></p>\n"
+                    print(f"Added STRONG: {text[:30]}")
+                else:
+                    if title_found and page_num == 0 and ("Pubblicato il" in text or "di Lucio CARACCIOLO" in text or "Home" in text):
+                        print(f"Skipped (metadata): {text[:30]}")
+                        continue
+                    html_content += f"        <p>{text}</p>\n"
+                    print(f"Added P: {text[:30]}")
+
+            elif block["type"] == 1:  # Image block
+                try:
+                    image_bytes = block["image"]
+                    base64_image = base64.b64encode(image_bytes).decode('utf-8')
+                    img_format = block.get("ext", "png")
+                    html_content += f'        <img src="data:image/{img_format};base64,{base64_image}" />\n'
+                except Exception as e:
+                    pass
+
+    html_content += """    </div>
+</body>
+</html>"""
+
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(html_content)
+
+    print(f"HTML generated successfully at {output_path}")
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        pdf_file = sys.argv[1]
+    else:
+        pdf_file = "Senza un piano di tregua, le nazioni europee scivolano verso la guerra - Limes.pdf"
+
+    output_file = "public/index.html"
+    extract_pdf_to_html(pdf_file, output_file)
--- a/public/index.html
+++ b/public/index.html
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,2 @@
+pymupdf
+Pillow