# Source: Hugging Face Space "Sanshruth/Bionic_Reading_Hub" (status: Running).
# File size: 11,604 bytes.

import base64
import glob
import html
import os
import xml.etree.ElementTree as ET

import gradio as gr
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.shared import Inches, Pt
from pdf2docx import Converter

def find_ttf_fonts():
    """Return the paths of all ``.ttf`` files below the current directory.

    The search is recursive and the pattern is relative, so the returned
    paths are relative to the working directory at call time.
    """
    ttf_pattern = '**/*.ttf'
    return glob.glob(ttf_pattern, recursive=True)

def embed_font_in_html(font_path, font_name, html_content):
    """Return *html_content* with the font and page stylesheet embedded.

    The font file at *font_path* is read, base64-encoded and inlined as an
    ``@font-face`` rule named *font_name*, together with the layout rules for
    the page, paragraphs, images and tables.

    Args:
        font_path: Path to the TrueType font file to embed.
        font_name: CSS ``font-family`` name to register and apply to ``body``.
        html_content: HTML document or fragment to style.

    Returns:
        The HTML string including the ``<style>`` block. When the input
        contains a ``</head>`` tag the block is placed just before it;
        otherwise it is prepended to the content.

    Raises:
        OSError: If the font file cannot be read.
    """
    with open(font_path, "rb") as font_file:
        encoded_font = base64.b64encode(font_file.read()).decode('utf-8')

    font_style = f"""
    <style>
    @font-face {{
        font-family: '{font_name}';
        src: url(data:font/ttf;base64,{encoded_font}) format('truetype');
    }}
    body {{
        font-family: '{font_name}', Arial, sans-serif;
        margin: 0;
        padding: 0;
        background-color: white;
    }}
    .page {{
        position: relative;
        width: 8.5in;
        margin: 20px auto;
        padding: 20px;
        box-sizing: border-box;
        background-color: white;
        box-shadow: 0 0 10px rgba(0,0,0,0.1);
    }}
    .paragraph {{
        margin: 0;
        padding: 0;
        position: relative;
    }}
    .image-container {{
        display: inline-block;
        position: relative;
        vertical-align: middle;
    }}
    img {{
        max-width: 100%;
        height: auto;
        display: inline-block;
        vertical-align: middle;
    }}
    table {{
        border-collapse: collapse;
        width: 100%;
        margin: 10px 0;
    }}
    td, th {{
        border: 1px solid black;
        padding: 8px;
        position: relative;
    }}
    </style>
    """

    # Putting markup ahead of "<!DOCTYPE html>" makes the document invalid and
    # switches browsers to quirks mode, so the stylesheet goes inside <head>
    # whenever the document has one.
    before_head_end, head_end, remainder = html_content.partition("</head>")
    if not head_end:
        # No <head> (e.g. a bare fragment): fall back to prepending.
        return font_style + html_content
    return before_head_end + font_style + head_end + remainder

def extract_images_from_doc(doc):
    """Map each image relationship id in *doc* to a base64 ``data:`` URI.

    Every relationship of the document part whose type mentions "image" is
    read and encoded. Image subtypes other than jpeg/jpg/png/gif are labelled
    as ``png``. A relationship that fails to load is reported on stdout and
    skipped, so one bad image does not abort the extraction.

    Returns:
        dict of ``rId`` -> ``"data:image/<type>;base64,<payload>"``.
    """
    known_types = ['jpeg', 'jpg', 'png', 'gif']
    data_uris = {}

    for relationship in doc.part.rels.values():
        if "image" not in relationship.reltype:
            continue
        try:
            raw_bytes = relationship.target_part.blob
            subtype = relationship.target_part.content_type.split('/')[-1]
            if subtype.lower() not in known_types:
                subtype = 'png'
            payload = base64.b64encode(raw_bytes).decode('utf-8')
            data_uris[relationship.rId] = f"data:image/{subtype};base64,{payload}"
        except Exception as e:
            print(f"Error processing image: {str(e)}")

    return data_uris

def get_image_position(element):
    """Return the anchored offset of a drawing as ``{'x': ..., 'y': ...}``.

    Looks for a ``wp:anchor`` below *element* and reads the ``wp:posOffset``
    values under its ``wp:positionH`` and ``wp:positionV`` children. The raw
    values are divided by 914400 (EMUs per inch); callers use the result as
    CSS inch lengths.

    Returns:
        A dict with float ``x`` and ``y`` values, or ``None`` when the drawing
        has no anchor, lacks an explicit offset on either axis, or anything
        goes wrong while reading it (lookup is deliberately best-effort).
    """
    wp_ns = {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'}
    try:
        anchor = element.find('.//wp:anchor', wp_ns)
        if anchor is None:
            return None

        horizontal = anchor.find('.//wp:positionH', wp_ns)
        vertical = anchor.find('.//wp:positionV', wp_ns)
        if horizontal is None or vertical is None:
            return None

        offset_x = horizontal.find('.//wp:posOffset', wp_ns)
        offset_y = vertical.find('.//wp:posOffset', wp_ns)
        if offset_x is None or offset_y is None:
            return None

        return {
            'x': int(offset_x.text) / 914400,
            'y': int(offset_y.text) / 914400
        }
    except Exception:
        return None

def process_paragraph(paragraph, images_dict):
    """Render one document paragraph as an HTML ``<div class="paragraph">``.

    Each run becomes a ``<span>`` carrying its bold / italic / underline /
    font-size styling. Drawings found in a run are emitted as ``<img>`` tags
    when their relationship id is present in *images_dict*; drawings with an
    anchored offset are positioned absolutely at that offset.

    Args:
        paragraph: Paragraph object exposing ``alignment`` and ``runs``.
        images_dict: Mapping of image relationship id -> ``data:`` URI.

    Returns:
        The HTML markup for the paragraph as a string.
    """
    w_ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    a_ns = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
    embed_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'

    parts = ['<div class="paragraph">']

    if paragraph.alignment == WD_ALIGN_PARAGRAPH.CENTER:
        parts.append('<div style="text-align: center;">')
    elif paragraph.alignment == WD_ALIGN_PARAGRAPH.RIGHT:
        parts.append('<div style="text-align: right;">')
    else:
        parts.append('<div>')

    for run in paragraph.runs:
        style = []
        if run.bold:
            style.append('font-weight: bold')
        if run.italic:
            style.append('font-style: italic')
        if run.underline:
            style.append('text-decoration: underline')
        if run.font.size:
            style.append(f'font-size: {run.font.size.pt}pt')

        for drawing in run._element.findall('.//w:drawing', w_ns):
            blip = drawing.find('.//a:blip', a_ns)
            if blip is None:
                continue
            image_rel_id = blip.get(embed_attr)
            if image_rel_id not in images_dict:
                continue

            position = get_image_position(drawing)
            if position:
                style_pos = f"position: absolute; left: {position['x']}in; top: {position['y']}in;"
                parts.append(f'<div class="image-container" style="{style_pos}">')
            else:
                parts.append('<div class="image-container">')
            parts.append(f'<img src="{images_dict[image_rel_id]}" alt="Document Image"/>')
            parts.append('</div>')

        # Emit every non-empty run: a whitespace-only run is usually the space
        # between two differently styled words, and skipping it glues them.
        if run.text:
            style_str = '; '.join(style)
            # Document text is untrusted for HTML purposes; escape it so that
            # characters such as "<" and "&" cannot break or inject markup.
            parts.append(f'<span style="{style_str}">{html.escape(run.text)}</span>')

    parts.append('</div></div>')
    return ''.join(parts)

def process_table(table, images_dict):
    """Render a document table as an HTML ``<table>``.

    Every row becomes a ``<tr>`` and every cell a ``<td>``. Inside a cell the
    runs of all its paragraphs are written in order as styled ``<span>``
    elements, and drawings whose relationship id is present in *images_dict*
    are written as inline ``<img>`` tags.

    Args:
        table: Table object exposing ``rows`` -> ``cells`` -> ``paragraphs``
            -> ``runs``.
        images_dict: Mapping of image relationship id -> ``data:`` URI.

    Returns:
        The HTML markup for the table as a string.
    """
    w_ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    a_ns = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
    embed_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'

    parts = ['<table>']
    for row in table.rows:
        parts.append('<tr>')
        for cell in row.cells:
            parts.append('<td>')
            for paragraph in cell.paragraphs:
                for run in paragraph.runs:
                    style = []
                    if run.bold:
                        style.append('font-weight: bold')
                    if run.italic:
                        style.append('font-style: italic')
                    if run.underline:
                        style.append('text-decoration: underline')
                    if run.font.size:
                        style.append(f'font-size: {run.font.size.pt}pt')

                    for drawing in run._element.findall('.//w:drawing', w_ns):
                        blip = drawing.find('.//a:blip', a_ns)
                        if blip is None:
                            continue
                        image_rel_id = blip.get(embed_attr)
                        if image_rel_id in images_dict:
                            parts.append('<div class="image-container">')
                            parts.append(f'<img src="{images_dict[image_rel_id]}" alt="Table Cell Image"/>')
                            parts.append('</div>')

                    # Keep whitespace-only runs: they are typically the space
                    # between two styled words, and dropping them joins words.
                    if run.text:
                        style_str = '; '.join(style)
                        # Escape the cell text so "<", ">" and "&" from the
                        # document cannot break or inject HTML markup.
                        parts.append(f'<span style="{style_str}">{html.escape(run.text)}</span>')
            parts.append('</td>')
        parts.append('</tr>')
    parts.append('</table>')
    return ''.join(parts)

def pdf_to_html(pdf_file, font_name):
    """Convert an uploaded PDF into a standalone HTML file.

    The PDF is first converted to a temporary DOCX next to it, the DOCX body
    is walked in document order (paragraphs and top-level tables), and the
    resulting markup is written to ``output_with_font.html``. When *font_name*
    matches the basename of a ``.ttf`` file found by ``find_ttf_fonts``, that
    font and the page stylesheet are embedded in the output.

    Args:
        pdf_file: The uploaded file; either an object whose ``name`` attribute
            is the path of the PDF, or the path itself as a string.
        font_name: Basename of the ``.ttf`` file to embed (e.g. the dropdown
            selection); ignored if no such font file is found.

    Returns:
        The path of the generated HTML file, or ``None`` when no file was
        given or the conversion failed (the error is printed).
    """
    if not pdf_file:
        return None

    docx_filename = None
    try:
        # Accept both an upload object exposing ``.name`` and a bare path.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        # Swap only the real extension. A plain ``replace('.pdf', '.docx')``
        # misses ".PDF" (making the DOCX path equal to the PDF path, so the
        # upload would be overwritten and then deleted) and also rewrites any
        # ".pdf" that appears elsewhere in the path.
        docx_filename = os.path.splitext(pdf_path)[0] + ".docx"

        cv = Converter(pdf_path)
        try:
            cv.convert(docx_filename)
        finally:
            # Release the converter even when the conversion itself fails.
            cv.close()

        doc = Document(docx_filename)
        images_dict = extract_images_from_doc(doc)

        # Index the wrapper objects by their XML element once, so each body
        # child is resolved in O(1) instead of re-scanning the document.
        paragraphs_by_element = {para._element: para for para in doc.paragraphs}
        tables_by_element = {table._element: table for table in doc.tables}

        parts = ["""<!DOCTYPE html>
        <html>
        <head>
            <meta charset='utf-8'>
            <title>Converted Document</title>
        </head>
        <body>
        <div class="page">
        """]

        # Walk the body in document order so paragraphs and tables interleave
        # exactly as they do in the source; other element kinds are skipped.
        for element in doc.element.body:
            if element in paragraphs_by_element:
                parts.append(process_paragraph(paragraphs_by_element[element], images_dict))
            elif element in tables_by_element:
                parts.append(process_table(tables_by_element[element], images_dict))

        parts.append("</div></body></html>")
        html_content = "".join(parts)

        ttf_files = {os.path.basename(f): f for f in find_ttf_fonts()}
        if font_name in ttf_files:
            font_path = ttf_files[font_name]
            font_name_clean = os.path.splitext(font_name)[0]
            html_content = embed_font_in_html(font_path, font_name_clean, html_content)

        html_filename = "output_with_font.html"
        with open(html_filename, "w", encoding="utf-8") as html_file:
            html_file.write(html_content)

        return html_filename

    except Exception as e:
        # Top-level boundary for the UI callback: report and signal failure.
        print(f"Error in pdf_to_html: {str(e)}")
        return None
    finally:
        # Always clean up the intermediate DOCX, on success and on failure.
        if docx_filename and os.path.exists(docx_filename):
            try:
                os.remove(docx_filename)
            except OSError as e:
                print(f"Could not remove temporary file {docx_filename}: {e}")

# Gradio Interface
# The layout below is built when this module is executed: a title, an example
# image, the upload/font/convert/download column, optional sample PDFs and a
# footer note. Statement order inside the `with` blocks defines the layout.
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("# Bionic Reading PDF Converter")
    gr.Markdown("### https://github.com/SanshruthR/Bionic_Reading_Hub")
    
    
    # Illustration shown under the title; "image.jpeg" is a relative path.
    with gr.Row():
        gr.Image("image.jpeg", 
                label="Bionic Reading Example", 
                show_label=False,
                width=400,
                height=300)


    with gr.Row():
        with gr.Column(scale=2):
            # Single-file upload restricted to the .pdf extension.
            pdf_input = gr.File(
                label="Upload Your PDF",
                file_types=[".pdf"],
                file_count="single"
            )
            
            # Offer every .ttf found by find_ttf_fonts() by basename; the first
            # one is preselected, or nothing when no font file was found.
            ttf_files = find_ttf_fonts()
            font_dropdown = gr.Dropdown(
                [os.path.basename(font) for font in ttf_files],
                label="Select Font Style",
                value=os.path.basename(ttf_files[0]) if ttf_files else None,
                info="Choose your preferred reading font"
            )
            
            convert_pdf_to_html = gr.Button(
                "Convert to Bionic Format",
                variant="primary",
                size="lg"
            )
            
            # Receives the path returned by pdf_to_html for download.
            font_output = gr.File(
                label="Download Enhanced HTML File",
                type="filepath"
            )

    with gr.Row():
        # Collect sample PDFs from the "examples" directory, if it exists.
        example_files = [
            os.path.join("examples", f) 
            for f in os.listdir("examples") 
            if f.endswith('.pdf')
        ] if os.path.exists("examples") else []
        
        # The examples widget is only created when at least one sample exists.
        if example_files:
            gr.Examples(
                example_files,
                pdf_input,
                label="Sample PDFs"
            )
    
    with gr.Row():
        gr.Markdown(
            """
            ---
            📝 Best results with text-based PDFs (not scanned documents)
            """
        )
            
    # Wire the button: the uploaded file and the selected font name are passed
    # to pdf_to_html, and its return value is shown in the download component.
    convert_pdf_to_html.click(
        pdf_to_html,
        inputs=[pdf_input, font_dropdown],
        outputs=[font_output]
    )

# Runs unconditionally whenever this module is executed or imported.
app.launch()