# homepage/parser/md.py
# 2025-05-13 10:48:42 +02:00
#
# 168 lines
# No EOL
# 5.8 KiB
# Python

import glob
import os
import re
from typing import List, Tuple

import markdown
def _add_link_attributes(match: "re.Match[str]") -> str:
    """Rebuild one ``<a ...>`` opening tag with the ``link`` class and a target.

    An existing ``class="..."`` attribute gets ``link`` appended to it, and an
    existing ``target`` attribute is kept as-is; otherwise the attributes are
    added.
    """
    attrs = match.group(1)
    # Merge into an existing class attribute instead of emitting a second one.
    attrs, merged = re.subn(
        r'(\sclass=")([^"]*)"',
        lambda m: f'{m.group(1)}{m.group(2)} link"',
        attrs,
        count=1,
    )
    if not merged:
        attrs += ' class="link"'
    if not re.search(r'\starget=', attrs):
        attrs += ' target="_blank"'
    return f'<a{attrs}>'


def markdown_to_html_paragraphs(markdown_text: str) -> Tuple[List[str], str]:
    """
    Convert markdown text into a list of HTML chunks plus the post title.

    The first line must be a heading (it has to start with ``#``). It is
    removed from the body and emitted as ``<p class="blog-title">``.
    Supports mathematical equations using LaTeX syntax.

    Args:
        markdown_text (str): The markdown text to convert.

    Returns:
        Tuple[List[str], str]: ``(chunks, title)`` where ``chunks`` starts with
        the title paragraph followed by the rendered HTML split on blank
        lines, and ``title`` is the plain title text without the leading
        ``#`` characters.

    Raises:
        ValueError: If the first line of ``markdown_text`` is not a heading.
    """
    # Rewrite relative image paths that point into a "*.assets/" folder so
    # they are served from /blog/md/ (paths already starting with "md/" or
    # "/" are left alone).
    markdown_text = re.sub(
        r'!\[(.*?)\]\((?!md/)([^/].*?\.assets/.*?)\)',
        r'![\1](/blog/md/\2)',
        markdown_text,
    )

    # The first line must be the title heading.
    lines = markdown_text.split('\n')
    first_line = lines[0].strip()
    if not first_line.startswith('#'):
        raise ValueError("No title found in the markdown file")
    title_line = first_line.lstrip('#').strip()
    bold_title = f'<p class="blog-title">{title_line}</p>'
    # Remove the title from the markdown to avoid duplicate processing.
    markdown_text = '\n'.join(lines[1:])

    extensions = [
        'markdown.extensions.extra',        # For blockquotes and other features
        'markdown.extensions.fenced_code',  # For code blocks
        'markdown.extensions.codehilite',   # For syntax highlighting
        'markdown.extensions.attr_list',    # For attributes
        'markdown.extensions.md_in_html',   # For markdown inside HTML
    ]
    try:
        # Prefer Arithmatex when it is installed; the import is only an
        # availability probe.
        import pymdownx.arithmatex  # noqa: F401
    except ImportError:
        # Fall back to the mdx_math extension.
        extensions.append('mdx_math')
        extension_configs = {
            'mdx_math': {
                'enable_dollar_delimiter': True,  # Enable $...$ for inline math
            }
        }
    else:
        extensions.append('pymdownx.arithmatex')
        extension_configs = {
            'pymdownx.arithmatex': {
                'generic': True  # Uses \(...\) for inline and \[...\] for display math
            }
        }

    # Convert markdown to HTML with math support.
    html = markdown.markdown(
        markdown_text,
        extensions=extensions,
        extension_configs=extension_configs,
    )

    # Unwrap images that sit alone inside a paragraph.
    html = re.sub(r'<p>\s*(<img[^>]+>)\s*</p>', r'\1', html, flags=re.IGNORECASE)

    # Convert image followed by blockquote to figure with caption. The
    # optional self-closing "/" is dropped so that it does not end up in the
    # middle of the attribute list once the class attribute is appended.
    html = re.sub(
        r'<img([^>]+?)\s*/?>\s*<blockquote>\s*<p>(.*?)</p>\s*</blockquote>',
        r'<figure class="figure">\n <img\1 class="figure-img img-fluid rounded">\n <figcaption class="figure-caption">\2</figcaption>\n</figure>',
        html,
        flags=re.DOTALL,
    )

    # Add the "link" class and target="_blank" to every <a> tag. Each tag is
    # handled on its own (the match cannot cross a ">"), and the lookahead
    # keeps tags such as <abbr> or <article> from being treated as anchors.
    html = re.sub(r'<a(?=[\s>])([^>]*)>', _add_link_attributes, html)

    cleaned_paragraphs = [bold_title]
    # NOTE: the split is purely textual, so a blank line inside a rendered
    # block also starts a new chunk.
    for chunk in html.split('\n\n'):
        chunk = chunk.strip()
        if not chunk:
            continue
        # Only bare text needs wrapping; anything that already starts with a
        # tag (including <p>) is kept unchanged.
        if not chunk.startswith('<'):
            chunk = f'<p>{chunk}</p>'
        cleaned_paragraphs.append(chunk)

    return cleaned_paragraphs, title_line
def insert_markdown_into_template(template_path: str, markdown_text: str) -> str:
    """
    Insert parsed markdown content into the template HTML file.

    The template may contain the placeholders ``{{ content }}`` and
    ``{{ title }}``; every occurrence of each is replaced.

    Args:
        template_path (str): Path to the template HTML file (read as UTF-8).
        markdown_text (str): The markdown text to convert and insert.

    Returns:
        str: Complete HTML with markdown content inserted.

    Raises:
        ValueError: Propagated from ``markdown_to_html_paragraphs`` when the
            markdown has no title line.
        OSError: If the template file cannot be read.
    """
    # Parse markdown into HTML chunks and the plain-text title.
    html_paragraphs, title_line = markdown_to_html_paragraphs(markdown_text)

    # Explicit encoding so the result does not depend on the platform default.
    with open(template_path, 'r', encoding='utf-8') as f:
        template = f.read()

    replacements = {
        'content': '\n'.join(html_paragraphs),
        'title': title_line,
    }
    # Substitute both placeholders in a single pass over the template, so that
    # placeholder-looking text inside the inserted content or title is left
    # untouched. A callable replacement also avoids backslash-escape handling.
    return re.sub(
        r'\{\{ (content|title) \}\}',
        lambda m: replacements[m.group(1)],
        template,
    )
def process_all_markdown_files():
    """
    Render every markdown file in dist/blog/md/ to an HTML file in dist/blog/html/.

    Each ``<name>.md`` is combined with ``dist/blog/template.html`` and written
    to ``dist/blog/html/<name>.html``. Paths are relative to the current
    working directory. A failure on one file is reported on stdout and does
    not stop the remaining files from being processed.
    """
    template_path = "dist/blog/template.html"
    output_dir = "dist/blog/html"

    # Sorted so the processing order (and the log output) is deterministic.
    md_files = sorted(glob.glob("dist/blog/md/*.md"))
    if md_files:
        # Make sure the destination exists before the first write.
        os.makedirs(output_dir, exist_ok=True)

    for md_file in md_files:
        # Base filename without the extension.
        base_name = os.path.splitext(os.path.basename(md_file))[0]
        html_file = f"{output_dir}/{base_name}.html"
        print(f"Processing {md_file} -> {html_file}")
        try:
            # Explicit UTF-8 so results do not depend on the platform default.
            with open(md_file, "r", encoding="utf-8") as f:
                markdown_text = f.read()
            complete_html = insert_markdown_into_template(template_path, markdown_text)
            with open(html_file, "w", encoding="utf-8") as f:
                f.write(complete_html)
        except Exception as e:
            # Best-effort batch job: report the failure and carry on.
            print(f"Error processing {md_file}: {str(e)}")
# Run the batch conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    process_all_markdown_files()