Matteo Benedetto 3 months ago
parent
commit
db7895c652
  1. 197
      extract_pdf.py
  2. 50
      public/index.md

197
extract_pdf.py

@ -4,73 +4,26 @@ import os
import sys
import re
def extract_pdf_to_html(pdf_path, output_path):
def clean_text(text):
    """Repair ligature glyphs and OCR artifacts in text extracted from the PDF.

    The source PDF's font encodes 'fi'/'fl' as single ligature code points
    that either survive as U+FB01/U+FB02 or are dropped entirely by the
    extractor, mangling Italian words; patch both cases.

    Args:
        text: Raw text extracted from a PDF block.

    Returns:
        The text with known ligature/OCR defects corrected.
    """
    # U+FB01 / U+FB02 are the 'fi' / 'fl' ligature presentation forms;
    # normalize them to plain ASCII pairs.
    text = text.replace("\ufb01", "fi")
    text = text.replace("\ufb02", "fl")
    # Words where the extractor dropped the ligature glyph outright.
    text = text.replace("con itto", "conflitto")
    text = text.replace("dicoltà", "difficoltà")
    text = text.replace("innita", "infinita")
    text = text.replace("Foreign Oce", "Foreign Office")
    text = text.replace("pacico", "pacifico")
    text = text.replace("di nire", "di finire")
    text = text.replace("a nire", "a finire")
    return text
def extract_pdf_to_md(pdf_path, output_path):
if not os.path.exists(pdf_path):
print(f"Error: File {pdf_path} not found.")
return
doc = fitz.open(pdf_path)
html_content = """<!DOCTYPE html>
<html lang="it">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Articolo Limes</title>
<style>
body {
font-family: 'Georgia', serif;
line-height: 1.6;
color: #333;
max-width: 800px;
margin: 0 auto;
padding: 20px;
background-color: #f4f4f4;
}
.article-container {
background-color: #fff;
padding: 40px;
box-shadow: 0 0 10px rgba(0,0,0,0.1);
}
h1 {
color: #b30000;
font-size: 2.2em;
margin-bottom: 10px;
line-height: 1.2;
text-transform: uppercase;
}
.subtitle {
font-size: 1.2em;
color: #555;
margin-bottom: 20px;
font-style: italic;
}
p {
margin-bottom: 15px;
text-align: justify;
}
img {
max-width: 100%;
height: auto;
display: block;
margin: 20px auto;
}
.caption {
font-size: 0.9em;
color: #666;
text-align: center;
margin-top: -10px;
margin-bottom: 20px;
font-style: italic;
}
strong {
color: #b30000;
}
</style>
</head>
<body>
<div class="article-container">
"""
md_content = ""
exclude_keywords = [
"MENU", "CERCA", "CHI SIAMO", "LA SCUOLA DI LIMES", "VOLUMI", "LEGGI PDF",
@ -78,40 +31,11 @@ def extract_pdf_to_html(pdf_path, output_path):
"IN EDICOLA", "ABBONATI", "Privacy", "Cookie Policy", "Gestione cookie",
"Riserva TDM", "Dichiarazione di accessibilità", "GEDI Periodici",
"Tutti a lezione di pace", "Il mito dell'Urbe", "IL PUNTO", "CARTA INEDITA",
"MENU CERCA", "RUBRICHE", "IL MONDO OGGI", "EBOOK", "AUTORI", "ARGOMENTI", "PROTAGONISTI", "FAQ"
"AREA GEOPOLITICA", "ARGOMENTI PIÙ TRATTATI", "Per consultare la carta",
"africa/america latina", "europa/italia", "medio oriente", "russia e csi", "usa e canada"
]
def clean_text(text):
    """Fix ligature glyphs and common mis-decoded characters in PDF text.

    The PDF encodes 'fi'/'fl' as single ligature code points and uses
    typographic quotes; map them back to plain ASCII so word matching
    and the rendered output are correct.

    Args:
        text: Raw text extracted from a PDF block.

    Returns:
        The cleaned text.
    """
    # NOTE: the keys must be the actual Unicode code points, never ""
    # — str.replace("", x) would insert x between every character.
    replacements = {
        "\ufb01": "fi",   # U+FB01 LATIN SMALL LIGATURE FI
        "\ufb02": "fl",   # U+FB02 LATIN SMALL LIGATURE FL
        "\u2019": "'",    # right single quotation mark
        "\u201c": '"',    # left double quotation mark
        "\u201d": '"',    # right double quotation mark
    }
    for src, dst in replacements.items():
        text = text.replace(src, dst)
    # Case where the ligature was dropped entirely by the extractor:
    # "conitto" -> "conflitto" (missing 'fl').
    text = text.replace("conitto", "conflitto")
    return text
title_found = False
# Buffer for text to handle hyphenation across blocks/pages if needed (though blocks are usually self-contained paragraphs)
# Actually, let's just process blocks.
for page_num in range(len(doc)):
page = doc[page_num]
@ -128,107 +52,72 @@ def extract_pdf_to_html(pdf_path, output_path):
continue
# 2. Sidebar filtering (sidebar is on the right)
if bbox[0] > 430:
if bbox[0] > 430: # Tightened from 440
continue
if block["type"] == 0: # Text block
# ... (text extraction logic)
text = ""
lines_text = []
for line in block["lines"]:
line_str = "".join([span["text"] for span in line["spans"]])
lines_text.append(line_str)
full_text = ""
for i, line in enumerate(lines_text):
if line.endswith("-") and i < len(lines_text) - 1:
full_text += line[:-1]
else:
full_text += line + " "
text = full_text.strip()
text = clean_text(text)
for span in line["spans"]:
text += span["text"]
text = text.strip()
if not text:
continue
# Debug print
print(f"Processing block: {text[:30]}... bbox={bbox}")
# Clean text
text = clean_text(text)
# 3. Keyword filtering
# Check if any keyword is in the text
# We use a more aggressive filter for uppercase or short text
if any(kw in text for kw in exclude_keywords):
if text.isupper() or len(text) < 100 or "Privacy" in text or "Cookie" in text:
print(f"Excluded junk: {text[:30]}...")
continue
if any(kw.lower() in text.lower() for kw in exclude_keywords):
continue
# Specific check for the footer line with slashes
if "africa/america latina" in text.lower():
continue
# 4. Title detection and cleaning
if not title_found and "Home" in text:
# Capture the full uppercase title
# Adjusted regex to be a bit more flexible
match = re.search(r"Home(?:[\w\s]+?)([A-Z\s,:'\"!?.—–-]{10,})", text)
match = re.search(r"Home(?:Carta Inedita della Settimana|Il Punto|[\w\s]+?)([A-Z\s,:'\"!?.—–-]{10,})", text)
if match:
title = match.group(1).strip()
remaining = text[match.end():]
# Fix for title grabbing the first letter of subtitle
if title and remaining and title[-1].isupper() and remaining[0].islower():
# Check if the last word of title is just 1 char
last_word = title.split()[-1]
# If it's a single letter and not a common vowel word (A, I, O, E, U, Y), assume it belongs to subtitle
if len(last_word) == 1 and last_word not in ["A", "I", "E", "O", "U"]:
char_to_move = title[-1]
title = title[:-1].strip()
remaining = char_to_move + remaining
html_content += f" <h1>{title}</h1>\n"
md_content += f"# {title}\n\n"
# Try to extract subtitle
remaining = text[match.end():]
subtitle_match = re.search(r"^([^.!?]+[.!?])", remaining)
if subtitle_match:
subtitle = subtitle_match.group(1).strip()
html_content += f" <div class='subtitle'>{subtitle}</div>\n"
md_content += f"_{subtitle}_\n\n"
title_found = True
continue
else:
# Fallback if regex fails but "Home" is there, maybe the title is just the rest
# Or maybe it's not the title block.
pass
# Formatting
if len(text) < 150 and text.isupper() and not title_found and len(text) > 10:
html_content += f" <h1>{text}</h1>\n"
if len(text) < 100 and text.isupper() and not title_found and len(text) > 10:
md_content += f"# {text}\n\n"
title_found = True
print(f"Added H1 (heuristic): {text[:30]}")
elif len(text) < 150 and text.isupper():
html_content += f" <p><strong>{text}</strong></p>\n"
print(f"Added STRONG: {text[:30]}")
elif len(text) < 100 and text.isupper():
md_content += f"**{text}**\n\n"
else:
if title_found and page_num == 0 and ("Pubblicato il" in text or "di Lucio CARACCIOLO" in text or "Home" in text):
print(f"Skipped (metadata): {text[:30]}")
continue
html_content += f" <p>{text}</p>\n"
print(f"Added P: {text[:30]}")
md_content += f"{text}\n\n"
elif block["type"] == 1: # Image block
try:
image_bytes = block["image"]
base64_image = base64.b64encode(image_bytes).decode('utf-8')
img_format = block.get("ext", "png")
html_content += f' <img src="data:image/{img_format};base64,{base64_image}" />\n'
md_content += f'<img src="data:image/{img_format};base64,{base64_image}" />\n\n'
except Exception as e:
pass
html_content += """ </div>
</body>
</html>"""
with open(output_path, "w", encoding="utf-8") as f:
f.write(html_content)
f.write(md_content)
print(f"HTML generated successfully at {output_path}")
print(f"Markdown generated successfully at {output_path}")
if __name__ == "__main__":
if len(sys.argv) > 1:
@ -236,5 +125,5 @@ if __name__ == "__main__":
else:
pdf_file = "Senza un piano di tregua, le nazioni europee scivolano verso la guerra - Limes.pdf"
output_file = "public/index.html"
extract_pdf_to_html(pdf_file, output_file)
output_file = "public/index.md"
extract_pdf_to_md(pdf_file, output_file)

50
public/index.md

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save