|
|
|
|
@ -0,0 +1,240 @@
|
|
|
|
|
import fitz |
|
|
|
|
import base64 |
|
|
|
|
import os |
|
|
|
|
import sys |
|
|
|
|
import re |
|
|
|
|
|
|
|
|
|
# Blocks whose bbox starts above this y value are treated as page header.
_HEADER_MAX_Y0 = 30
# Blocks whose bbox ends below this y value are treated as page footer.
_FOOTER_MIN_Y1 = 815
# Blocks whose bbox starts right of this x value are treated as sidebar.
_SIDEBAR_MIN_X0 = 430

# Site navigation / boilerplate strings; a block containing one of these is
# dropped when it also looks like chrome (see _is_site_chrome).
_EXCLUDE_KEYWORDS = (
    "MENU", "CERCA", "CHI SIAMO", "LA SCUOLA DI LIMES", "VOLUMI", "LEGGI PDF",
    "I NOSTRI TEMI", "CARTE", "VIDEO", "PODCAST", "EVENTI", "MATBEN88",
    "IN EDICOLA", "ABBONATI", "Privacy", "Cookie Policy", "Gestione cookie",
    "Riserva TDM", "Dichiarazione di accessibilità", "GEDI Periodici",
    "Tutti a lezione di pace", "Il mito dell'Urbe", "IL PUNTO", "CARTA INEDITA",
    "MENU CERCA", "RUBRICHE", "IL MONDO OGGI", "EBOOK", "AUTORI", "ARGOMENTI",
    "PROTAGONISTI", "FAQ",
)

# Single-character fixes applied in one pass: Unicode ligatures are expanded
# to plain letters and typographic quotes are converted to ASCII quotes.
# Explicit escapes are used so the keys cannot be silently lost or normalised
# away by an editor (an empty key would make str.replace insert the
# replacement between every character).
_CHAR_FIXES = str.maketrans({
    "\ufb00": "ff",
    "\ufb01": "fi",
    "\ufb02": "fl",
    "\ufb03": "ffi",
    "\ufb04": "ffl",
    "\u2019": "'",
    "\u201c": '"',
    "\u201d": '"',
})

# "conflitto" was observed extracted as "conitto" (the "fl" glyph missing).
# NOTE(review): the glyph may instead come through as a single unmapped
# symbol; one optional non-word, non-space character is tolerated for that
# case - confirm against the real extractor output.
_CONFLITTO_RE = re.compile(r"con[^\w\s]?itto")

# Breadcrumb ("Home ...") followed by a run of at least 10 upper-case/title
# punctuation characters, which is taken to be the article title.
_TITLE_RE = re.compile(r"Home(?:[\w\s]+?)([A-Z\s,:'\"!?.—–-]{10,})")
# First sentence of the text that follows the title.
_SUBTITLE_RE = re.compile(r"^([^.!?]+[.!?])")

_HTML_HEAD = """<!DOCTYPE html>
<html lang="it">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Articolo Limes</title>
    <style>
        body {
            font-family: 'Georgia', serif;
            line-height: 1.6;
            color: #333;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f4f4f4;
        }
        .article-container {
            background-color: #fff;
            padding: 40px;
            box-shadow: 0 0 10px rgba(0,0,0,0.1);
        }
        h1 {
            color: #b30000;
            font-size: 2.2em;
            margin-bottom: 10px;
            line-height: 1.2;
            text-transform: uppercase;
        }
        .subtitle {
            font-size: 1.2em;
            color: #555;
            margin-bottom: 20px;
            font-style: italic;
        }
        p {
            margin-bottom: 15px;
            text-align: justify;
        }
        img {
            max-width: 100%;
            height: auto;
            display: block;
            margin: 20px auto;
        }
        .caption {
            font-size: 0.9em;
            color: #666;
            text-align: center;
            margin-top: -10px;
            margin-bottom: 20px;
            font-style: italic;
        }
        strong {
            color: #b30000;
        }
    </style>
</head>
<body>
    <div class="article-container">
"""

_HTML_TAIL = """    </div>
</body>
</html>"""


def _clean_text(text):
    """Expand ligatures, straighten quotes and repair the known "conitto" glitch."""
    return _CONFLITTO_RE.sub("conflitto", text.translate(_CHAR_FIXES))


def _join_lines(lines):
    """Join the lines of one block, undoing end-of-line hyphenation.

    A line ending in "-" is glued to the next line without the hyphen; the
    last line of the block is never de-hyphenated. Other lines are joined
    with a single space. The result is stripped.
    """
    last = len(lines) - 1
    pieces = [
        line[:-1] if line.endswith("-") and i < last else line + " "
        for i, line in enumerate(lines)
    ]
    return "".join(pieces).strip()


def _is_site_chrome(text):
    """Return True when *text* contains an excluded keyword and looks like navigation."""
    if not any(keyword in text for keyword in _EXCLUDE_KEYWORDS):
        return False
    # Long mixed-case paragraphs that merely mention a keyword are kept.
    return (
        text.isupper()
        or len(text) < 100
        or "Privacy" in text
        or "Cookie" in text
    )


def _split_title(text):
    """Split a breadcrumb block into ``(title, remaining)``; None if no title matches."""
    match = _TITLE_RE.search(text)
    if match is None:
        return None

    title = match.group(1).strip()
    remaining = text[match.end():]

    # The upper-case run can swallow the capital first letter of the
    # subtitle. If the title ends in a lone letter that is not a vowel word,
    # hand that letter back to the text that follows.
    if title and remaining and title[-1].isupper() and remaining[0].islower():
        last_word = title.split()[-1]
        if len(last_word) == 1 and last_word not in ("A", "I", "E", "O", "U"):
            remaining = title[-1] + remaining
            title = title[:-1].strip()

    return title, remaining


def extract_pdf_to_html(pdf_path, output_path):
    """Convert the article in a PDF into a standalone HTML page.

    Each page is read with ``page.get_text("dict")`` and its blocks are
    processed top to bottom. Header, footer and right-hand sidebar blocks are
    dropped by bounding box, navigation boilerplate is dropped by keyword,
    the first breadcrumb/upper-case block becomes the ``<h1>``, short
    upper-case blocks become bold paragraphs, and image blocks are embedded
    as base64 data URIs. Progress is reported with ``print``.

    Args:
        pdf_path: Path of the PDF to read. If it does not exist an error is
            printed and nothing is written.
        output_path: Path of the HTML file to write (UTF-8). Its parent
            directory is created when missing.

    Returns:
        None.
    """
    # Local import: only this block needs it, so the file's import section
    # is left untouched.
    import html

    if not os.path.exists(pdf_path):
        print(f"Error: File {pdf_path} not found.")
        return

    parts = [_HTML_HEAD]
    title_found = False

    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc):
            # Sort blocks by vertical position.
            blocks = sorted(
                page.get_text("dict")["blocks"], key=lambda b: b["bbox"][1]
            )

            for block in blocks:
                bbox = block["bbox"]

                # 1. Header/footer filtering.
                if bbox[1] < _HEADER_MAX_Y0 or bbox[3] > _FOOTER_MIN_Y1:
                    continue
                # 2. Sidebar filtering (sidebar is on the right).
                if bbox[0] > _SIDEBAR_MIN_X0:
                    continue

                if block["type"] == 1:  # Image block
                    try:
                        encoded = base64.b64encode(block["image"]).decode("utf-8")
                    except (KeyError, TypeError) as exc:
                        # Best effort: a broken image must not abort the page.
                        print(f"Skipped image block: {exc}")
                        continue
                    img_format = html.escape(str(block.get("ext", "png")))
                    parts.append(
                        f'        <img src="data:image/{img_format};base64,{encoded}" />\n'
                    )
                    continue

                if block["type"] != 0:  # Not a text block
                    continue

                lines = [
                    "".join(span["text"] for span in line["spans"])
                    for line in block["lines"]
                ]
                text = _clean_text(_join_lines(lines))
                if not text:
                    continue

                # Debug print
                print(f"Processing block: {text[:30]}... bbox={bbox}")

                # 3. Keyword filtering.
                if _is_site_chrome(text):
                    print(f"Excluded junk: {text[:30]}...")
                    continue

                # 4. Title detection from the breadcrumb block.
                if not title_found and "Home" in text:
                    split = _split_title(text)
                    if split is not None:
                        title, remaining = split
                        parts.append(
                            f"        <h1>{html.escape(title, quote=False)}</h1>\n"
                        )
                        subtitle_match = _SUBTITLE_RE.search(remaining)
                        if subtitle_match:
                            subtitle = subtitle_match.group(1).strip()
                            parts.append(
                                "        <div class='subtitle'>"
                                f"{html.escape(subtitle, quote=False)}</div>\n"
                            )
                        title_found = True
                        continue
                    # No title pattern: fall through and treat the block
                    # like any other text.

                # PDF text is escaped so "<" or "&" cannot break the markup.
                safe_text = html.escape(text, quote=False)
                short_upper = len(text) < 150 and text.isupper()

                if short_upper and not title_found and len(text) > 10:
                    parts.append(f"        <h1>{safe_text}</h1>\n")
                    title_found = True
                    print(f"Added H1 (heuristic): {text[:30]}")
                elif short_upper:
                    parts.append(f"        <p><strong>{safe_text}</strong></p>\n")
                    print(f"Added STRONG: {text[:30]}")
                else:
                    # After the title, first-page byline/date/breadcrumb
                    # blocks are metadata, not article body.
                    if title_found and page_num == 0 and (
                        "Pubblicato il" in text
                        or "di Lucio CARACCIOLO" in text
                        or "Home" in text
                    ):
                        print(f"Skipped (metadata): {text[:30]}")
                        continue
                    parts.append(f"        <p>{safe_text}</p>\n")
                    print(f"Added P: {text[:30]}")
    finally:
        # Release the file handle even if extraction fails midway.
        doc.close()

    parts.append(_HTML_TAIL)

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write("".join(parts))

    print(f"HTML generated successfully at {output_path}")
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: the first command-line argument, when given, is the
    # PDF to convert; otherwise the bundled article file name is used.
    cli_args = sys.argv[1:]
    pdf_file = (
        cli_args[0]
        if cli_args
        else "Senza un piano di tregua, le nazioni europee scivolano verso la guerra - Limes.pdf"
    )

    # The generated page always goes to the same location.
    output_file = "public/index.html"
    extract_pdf_to_html(pdf_file, output_file)