Matteo Benedetto 3 months ago
parent
commit
db7895c652
  1. 197
      extract_pdf.py
  2. 50
      public/index.md

197
extract_pdf.py

@ -4,73 +4,26 @@ import os
import sys
import re
def extract_pdf_to_html(pdf_path, output_path):
def clean_text(text):
    """Repair ligature glyphs and OCR artifacts in text extracted from the PDF.

    The source PDF's font encodes 'fi'/'fl' as single ligature code points
    that either survive as U+FB01/U+FB02 or are dropped entirely by the
    extractor, mangling Italian words; patch both cases.

    Args:
        text: Raw text extracted from a PDF block.

    Returns:
        The text with known ligature/OCR defects corrected.
    """
    # U+FB01 / U+FB02 are the 'fi' / 'fl' ligature presentation forms;
    # normalize them to plain ASCII pairs.
    text = text.replace("\ufb01", "fi")
    text = text.replace("\ufb02", "fl")
    # Words where the extractor dropped the ligature glyph outright.
    text = text.replace("con itto", "conflitto")
    text = text.replace("dicoltà", "difficoltà")
    text = text.replace("innita", "infinita")
    text = text.replace("Foreign Oce", "Foreign Office")
    text = text.replace("pacico", "pacifico")
    text = text.replace("di nire", "di finire")
    text = text.replace("a nire", "a finire")
    return text
def extract_pdf_to_md(pdf_path, output_path):
if not os.path.exists(pdf_path):
print(f"Error: File {pdf_path} not found.")
return
doc = fitz.open(pdf_path)
html_content = """<!DOCTYPE html>
<html lang="it">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Articolo Limes</title>
<style>
body {
font-family: 'Georgia', serif;
line-height: 1.6;
color: #333;
max-width: 800px;
margin: 0 auto;
padding: 20px;
background-color: #f4f4f4;
}
.article-container {
background-color: #fff;
padding: 40px;
box-shadow: 0 0 10px rgba(0,0,0,0.1);
}
h1 {
color: #b30000;
font-size: 2.2em;
margin-bottom: 10px;
line-height: 1.2;
text-transform: uppercase;
}
.subtitle {
font-size: 1.2em;
color: #555;
margin-bottom: 20px;
font-style: italic;
}
p {
margin-bottom: 15px;
text-align: justify;
}
img {
max-width: 100%;
height: auto;
display: block;
margin: 20px auto;
}
.caption {
font-size: 0.9em;
color: #666;
text-align: center;
margin-top: -10px;
margin-bottom: 20px;
font-style: italic;
}
strong {
color: #b30000;
}
</style>
</head>
<body>
<div class="article-container">
"""
md_content = ""
exclude_keywords = [
"MENU", "CERCA", "CHI SIAMO", "LA SCUOLA DI LIMES", "VOLUMI", "LEGGI PDF",
@ -78,40 +31,11 @@ def extract_pdf_to_html(pdf_path, output_path):
"IN EDICOLA", "ABBONATI", "Privacy", "Cookie Policy", "Gestione cookie",
"Riserva TDM", "Dichiarazione di accessibilità", "GEDI Periodici",
"Tutti a lezione di pace", "Il mito dell'Urbe", "IL PUNTO", "CARTA INEDITA",
"MENU CERCA", "RUBRICHE", "IL MONDO OGGI", "EBOOK", "AUTORI", "ARGOMENTI", "PROTAGONISTI", "FAQ"
"AREA GEOPOLITICA", "ARGOMENTI PIÙ TRATTATI", "Per consultare la carta",
"africa/america latina", "europa/italia", "medio oriente", "russia e csi", "usa e canada"
]
def clean_text(text):
    """Fix ligature glyphs and common mis-decoded characters in PDF text.

    The PDF encodes 'fi'/'fl' as single ligature code points and uses
    typographic quotes; map them back to plain ASCII so word matching
    and the rendered output are correct.

    Args:
        text: Raw text extracted from a PDF block.

    Returns:
        The cleaned text.
    """
    # NOTE: the keys must be the actual Unicode code points, never ""
    # — str.replace("", x) would insert x between every character.
    replacements = {
        "\ufb01": "fi",   # U+FB01 LATIN SMALL LIGATURE FI
        "\ufb02": "fl",   # U+FB02 LATIN SMALL LIGATURE FL
        "\u2019": "'",    # right single quotation mark
        "\u201c": '"',    # left double quotation mark
        "\u201d": '"',    # right double quotation mark
    }
    for src, dst in replacements.items():
        text = text.replace(src, dst)
    # Case where the ligature was dropped entirely by the extractor:
    # "conitto" -> "conflitto" (missing 'fl').
    text = text.replace("conitto", "conflitto")
    return text
title_found = False
# Buffer for text to handle hyphenation across blocks/pages if needed (though blocks are usually self-contained paragraphs)
# Actually, let's just process blocks.
for page_num in range(len(doc)):
page = doc[page_num]
@ -128,107 +52,72 @@ def extract_pdf_to_html(pdf_path, output_path):
continue
# 2. Sidebar filtering (sidebar is on the right)
if bbox[0] > 430:
if bbox[0] > 430: # Tightened from 440
continue
if block["type"] == 0: # Text block
# ... (text extraction logic)
text = ""
lines_text = []
for line in block["lines"]:
line_str = "".join([span["text"] for span in line["spans"]])
lines_text.append(line_str)
full_text = ""
for i, line in enumerate(lines_text):
if line.endswith("-") and i < len(lines_text) - 1:
full_text += line[:-1]
else:
full_text += line + " "
text = full_text.strip()
text = clean_text(text)
for span in line["spans"]:
text += span["text"]
text = text.strip()
if not text:
continue
# Debug print
print(f"Processing block: {text[:30]}... bbox={bbox}")
# Clean text
text = clean_text(text)
# 3. Keyword filtering
# Check if any keyword is in the text
# We use a more aggressive filter for uppercase or short text
if any(kw in text for kw in exclude_keywords):
if text.isupper() or len(text) < 100 or "Privacy" in text or "Cookie" in text:
print(f"Excluded junk: {text[:30]}...")
continue
if any(kw.lower() in text.lower() for kw in exclude_keywords):
continue
# Specific check for the footer line with slashes
if "africa/america latina" in text.lower():
continue
# 4. Title detection and cleaning
if not title_found and "Home" in text:
# Capture the full uppercase title
# Adjusted regex to be a bit more flexible
match = re.search(r"Home(?:[\w\s]+?)([A-Z\s,:'\"!?.—–-]{10,})", text)
match = re.search(r"Home(?:Carta Inedita della Settimana|Il Punto|[\w\s]+?)([A-Z\s,:'\"!?.—–-]{10,})", text)
if match:
title = match.group(1).strip()
remaining = text[match.end():]
# Fix for title grabbing the first letter of subtitle
if title and remaining and title[-1].isupper() and remaining[0].islower():
# Check if the last word of title is just 1 char
last_word = title.split()[-1]
# If it's a single letter and not a common vowel word (A, I, O, E, U, Y), assume it belongs to subtitle
if len(last_word) == 1 and last_word not in ["A", "I", "E", "O", "U"]:
char_to_move = title[-1]
title = title[:-1].strip()
remaining = char_to_move + remaining
html_content += f" <h1>{title}</h1>\n"
md_content += f"# {title}\n\n"
# Try to extract subtitle
remaining = text[match.end():]
subtitle_match = re.search(r"^([^.!?]+[.!?])", remaining)
if subtitle_match:
subtitle = subtitle_match.group(1).strip()
html_content += f" <div class='subtitle'>{subtitle}</div>\n"
md_content += f"_{subtitle}_\n\n"
title_found = True
continue
else:
# Fallback if regex fails but "Home" is there, maybe the title is just the rest
# Or maybe it's not the title block.
pass
# Formatting
if len(text) < 150 and text.isupper() and not title_found and len(text) > 10:
html_content += f" <h1>{text}</h1>\n"
if len(text) < 100 and text.isupper() and not title_found and len(text) > 10:
md_content += f"# {text}\n\n"
title_found = True
print(f"Added H1 (heuristic): {text[:30]}")
elif len(text) < 150 and text.isupper():
html_content += f" <p><strong>{text}</strong></p>\n"
print(f"Added STRONG: {text[:30]}")
elif len(text) < 100 and text.isupper():
md_content += f"**{text}**\n\n"
else:
if title_found and page_num == 0 and ("Pubblicato il" in text or "di Lucio CARACCIOLO" in text or "Home" in text):
print(f"Skipped (metadata): {text[:30]}")
continue
html_content += f" <p>{text}</p>\n"
print(f"Added P: {text[:30]}")
md_content += f"{text}\n\n"
elif block["type"] == 1: # Image block
try:
image_bytes = block["image"]
base64_image = base64.b64encode(image_bytes).decode('utf-8')
img_format = block.get("ext", "png")
html_content += f' <img src="data:image/{img_format};base64,{base64_image}" />\n'
md_content += f'<img src="data:image/{img_format};base64,{base64_image}" />\n\n'
except Exception as e:
pass
html_content += """ </div>
</body>
</html>"""
with open(output_path, "w", encoding="utf-8") as f:
f.write(html_content)
f.write(md_content)
print(f"HTML generated successfully at {output_path}")
print(f"Markdown generated successfully at {output_path}")
if __name__ == "__main__":
if len(sys.argv) > 1:
@ -236,5 +125,5 @@ if __name__ == "__main__":
else:
pdf_file = "Senza un piano di tregua, le nazioni europee scivolano verso la guerra - Limes.pdf"
output_file = "public/index.html"
extract_pdf_to_html(pdf_file, output_file)
output_file = "public/index.md"
extract_pdf_to_md(pdf_file, output_file)

50
public/index.md

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save