Browse Source

Initial commit

master
Matteo Benedetto 3 months ago
commit
c3dae45f86
  1. BIN
      Carta_ Verso un Intermarium del Nord - Limes.pdf
  2. BIN
      Senza un piano di tregua, le nazioni europee scivolano verso la guerra - Limes.pdf
  3. 240
      extract_pdf.py
  4. 73
      public/index.html
  5. 2
      requirements.txt

BIN
Carta_ Verso un Intermarium del Nord - Limes.pdf

Binary file not shown.

BIN
Senza un piano di tregua, le nazioni europee scivolano verso la guerra - Limes.pdf

Binary file not shown.

240
extract_pdf.py

@ -0,0 +1,240 @@
import base64
import html
import os
import re
import sys

import fitz
# Layout thresholds, in the page coordinates reported by ``page.get_text("dict")``.
# Blocks whose top edge is above _HEADER_MAX_Y or whose bottom edge is below
# _FOOTER_MIN_Y are treated as running headers/footers; blocks starting to the
# right of _SIDEBAR_MIN_X are treated as the right-hand sidebar.
_HEADER_MAX_Y = 30
_FOOTER_MIN_Y = 815
_SIDEBAR_MIN_X = 430

# Site navigation / boilerplate strings that mark a block as non-article text.
_EXCLUDE_KEYWORDS = (
    "MENU", "CERCA", "CHI SIAMO", "LA SCUOLA DI LIMES", "VOLUMI", "LEGGI PDF",
    "I NOSTRI TEMI", "CARTE", "VIDEO", "PODCAST", "EVENTI", "MATBEN88",
    "IN EDICOLA", "ABBONATI", "Privacy", "Cookie Policy", "Gestione cookie",
    "Riserva TDM", "Dichiarazione di accessibilità", "GEDI Periodici",
    "Tutti a lezione di pace", "Il mito dell'Urbe", "IL PUNTO", "CARTA INEDITA",
    "MENU CERCA", "RUBRICHE", "IL MONDO OGGI", "EBOOK", "AUTORI", "ARGOMENTI",
    "PROTAGONISTI", "FAQ",
)

# Glyph -> ASCII replacements, written as explicit escapes so the keys cannot
# be silently lost by an editor or a copy/paste (an empty key would make
# ``str.replace`` insert the value between every character).
# NOTE(review): the original glyph keys were not recoverable; the standard
# ligature and typographic-quote code points are assumed here. If the PDFs use
# private-use code points for ligatures, add them to this table.
_GLYPH_TABLE = str.maketrans({
    "\ufb00": "ff",
    "\ufb01": "fi",
    "\ufb02": "fl",
    "\ufb03": "ffi",
    "\ufb04": "ffl",
    "\u2019": "'",
    "\u201c": '"',
    "\u201d": '"',
})

# Breadcrumb ("Home ...") followed by a run of at least 10 upper-case
# title characters.
_TITLE_RE = re.compile(r"Home(?:[\w\s]+?)([A-Z\s,:'\"!?.—–-]{10,})")
# First sentence of the text that follows the title.
_SUBTITLE_RE = re.compile(r"^([^.!?]+[.!?])")

_HTML_HEAD = """<!DOCTYPE html>
<html lang="it">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Articolo Limes</title>
<style>
body {
font-family: 'Georgia', serif;
line-height: 1.6;
color: #333;
max-width: 800px;
margin: 0 auto;
padding: 20px;
background-color: #f4f4f4;
}
.article-container {
background-color: #fff;
padding: 40px;
box-shadow: 0 0 10px rgba(0,0,0,0.1);
}
h1 {
color: #b30000;
font-size: 2.2em;
margin-bottom: 10px;
line-height: 1.2;
text-transform: uppercase;
}
.subtitle {
font-size: 1.2em;
color: #555;
margin-bottom: 20px;
font-style: italic;
}
p {
margin-bottom: 15px;
text-align: justify;
}
img {
max-width: 100%;
height: auto;
display: block;
margin: 20px auto;
}
.caption {
font-size: 0.9em;
color: #666;
text-align: center;
margin-top: -10px;
margin-bottom: 20px;
font-style: italic;
}
strong {
color: #b30000;
}
</style>
</head>
<body>
<div class="article-container">
"""

_HTML_TAIL = """ </div>
</body>
</html>"""


def _clean_text(text):
    """Expand ligature glyphs and replace typographic quotes in *text*."""
    text = text.translate(_GLYPH_TABLE)
    # Covers the case where extraction dropped the "fl" ligature entirely.
    return text.replace("conitto", "conflitto")


def _join_lines(block):
    """Merge the lines of a text block, undoing end-of-line hyphenation."""
    lines = [
        "".join(span["text"] for span in line["spans"])
        for line in block["lines"]
    ]
    last_index = len(lines) - 1
    pieces = []
    for index, line in enumerate(lines):
        if line.endswith("-") and index < last_index:
            # A trailing hyphen on a non-final line is a word break: glue the
            # two halves together without the hyphen or a space.
            pieces.append(line[:-1])
        else:
            pieces.append(line + " ")
    return "".join(pieces).strip()


def _is_junk(text):
    """Return True when *text* looks like site navigation or legal boilerplate."""
    if not any(keyword in text for keyword in _EXCLUDE_KEYWORDS):
        return False
    # A keyword alone is not enough (article prose may contain one); only
    # drop the block when it is also shouty, short, or a privacy/cookie notice.
    return (
        text.isupper()
        or len(text) < 100
        or "Privacy" in text
        or "Cookie" in text
    )


def _split_title(text):
    """Parse a breadcrumb block into ``(title, subtitle)``.

    Returns ``None`` when the title pattern does not match; *subtitle* is
    ``None`` when no first sentence follows the title.
    """
    match = _TITLE_RE.search(text)
    if match is None:
        return None
    title = match.group(1).strip()
    remaining = text[match.end():]
    # The upper-case title pattern can swallow the capital first letter of the
    # subtitle. A trailing one-letter "word" that is not a vowel is assumed to
    # belong to the subtitle and is moved back.
    if title and remaining and title[-1].isupper() and remaining[0].islower():
        last_word = title.split()[-1]
        if len(last_word) == 1 and last_word not in ("A", "I", "E", "O", "U"):
            remaining = title[-1] + remaining
            title = title[:-1].strip()
    subtitle_match = _SUBTITLE_RE.search(remaining)
    subtitle = subtitle_match.group(1).strip() if subtitle_match else None
    return title, subtitle


def _render_image(block):
    """Return an ``<img>`` tag embedding the block's image, or None on failure."""
    try:
        encoded = base64.b64encode(block["image"]).decode("utf-8")
    except (KeyError, TypeError) as exc:
        # Best effort: a malformed image block must not abort the whole export.
        print(f"Skipped image block: {exc!r}")
        return None
    img_format = block.get("ext", "png")
    return f' <img src="data:image/{img_format};base64,{encoded}" />\n'


def extract_pdf_to_html(pdf_path, output_path):
    """Convert an article PDF into a standalone HTML page.

    Every page is read block by block (top to bottom). Header, footer and
    sidebar blocks are dropped by position, navigation/boilerplate blocks by
    keyword, and the remaining text is emitted as ``<h1>``, subtitle,
    ``<p><strong>`` or ``<p>`` elements. Image blocks are embedded as base64
    data URIs. All extracted text is HTML-escaped before being written.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path of the HTML file to write; its parent directory is
            created when missing.

    Returns:
        None. When *pdf_path* does not exist an error is printed and nothing
        is written.
    """
    if not os.path.exists(pdf_path):
        print(f"Error: File {pdf_path} not found.")
        return

    parts = [_HTML_HEAD]
    title_found = False

    # The context manager guarantees the document is closed even on error.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            page_dict = page.get_text("dict")
            # Sort blocks by vertical position.
            blocks = sorted(page_dict["blocks"], key=lambda b: b["bbox"][1])
            for block in blocks:
                bbox = block["bbox"]
                # 1. Header/footer filtering.
                if bbox[1] < _HEADER_MAX_Y or bbox[3] > _FOOTER_MIN_Y:
                    continue
                # 2. Sidebar filtering (sidebar is on the right).
                if bbox[0] > _SIDEBAR_MIN_X:
                    continue

                if block["type"] == 1:  # Image block
                    tag = _render_image(block)
                    if tag is not None:
                        parts.append(tag)
                    continue
                if block["type"] != 0:  # Neither text nor image
                    continue

                text = _clean_text(_join_lines(block))
                if not text:
                    continue
                print(f"Processing block: {text[:30]}... bbox={bbox}")

                # 3. Keyword filtering.
                if _is_junk(text):
                    print(f"Excluded junk: {text[:30]}...")
                    continue

                # 4. Title detection from the breadcrumb block.
                if not title_found and "Home" in text:
                    parsed = _split_title(text)
                    if parsed is not None:
                        title, subtitle = parsed
                        parts.append(
                            f" <h1>{html.escape(title, quote=False)}</h1>\n"
                        )
                        if subtitle is not None:
                            parts.append(
                                " <div class='subtitle'>"
                                f"{html.escape(subtitle, quote=False)}</div>\n"
                            )
                        title_found = True
                        continue
                    # No match: fall through and format it like any other block.

                # 5. Formatting. Escape once; the raw text is still used for
                # the heuristics and the log lines.
                safe_text = html.escape(text, quote=False)
                short_upper = len(text) < 150 and text.isupper()
                if short_upper and not title_found and len(text) > 10:
                    parts.append(f" <h1>{safe_text}</h1>\n")
                    title_found = True
                    print(f"Added H1 (heuristic): {text[:30]}")
                elif short_upper:
                    parts.append(f" <p><strong>{safe_text}</strong></p>\n")
                    print(f"Added STRONG: {text[:30]}")
                else:
                    is_metadata = (
                        "Pubblicato il" in text
                        or "di Lucio CARACCIOLO" in text
                        or "Home" in text
                    )
                    if title_found and page_num == 0 and is_metadata:
                        print(f"Skipped (metadata): {text[:30]}")
                        continue
                    parts.append(f" <p>{safe_text}</p>\n")
                    print(f"Added P: {text[:30]}")

    parts.append(_HTML_TAIL)

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("".join(parts))
    print(f"HTML generated successfully at {output_path}")
if __name__ == "__main__":
    # Script entry point: the first command-line argument, when given, names
    # the PDF to convert; otherwise the bundled sample article is used. The
    # result is always written to public/index.html.
    pdf_file = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "Senza un piano di tregua, le nazioni europee scivolano verso la guerra - Limes.pdf"
    )
    output_file = "public/index.html"
    extract_pdf_to_html(pdf_file, output_file)

73
public/index.html

File diff suppressed because one or more lines are too long

2
requirements.txt

@ -0,0 +1,2 @@
pymupdf
Pillow
Loading…
Cancel
Save