web-crawling

Running

App Files Files Community

pvanand committed on Oct 8, 2024

Commit

f6f97c8

verified ·

1 Parent(s): 9e5e37a

Update file_conversion.py

Browse files

Files changed (1) hide show

file_conversion.py +76 -27

file_conversion.py CHANGED Viewed

@@ -1,51 +1,100 @@
-from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks
 from fastapi.responses import FileResponse
 from pdf2docx import Converter
 import os
-import logging
 import shutil
 router = APIRouter()
-# Setup logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# Define the temp directory
 TEMP_DIR = "/.tempfiles"
 def remove_file(path: str):
     if os.path.exists(path):
         os.unlink(path)
 @router.post("/convert/pdf_to_docx")
 async def convert_pdf_to_docx(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
     if not file.filename.endswith('.pdf'):
         raise HTTPException(status_code=400, detail="File must be a PDF")
-    os.makedirs(TEMP_DIR, exist_ok=True)
-    pdf_temp_path = os.path.join(TEMP_DIR, f"temp_{file.filename}")
     docx_temp_path = pdf_temp_path.replace('.pdf', '.docx')
     try:
-        with open(pdf_temp_path, "wb") as pdf_file:
-            shutil.copyfileobj(file.file, pdf_file)
-        cv = Converter(pdf_temp_path)
-        cv.convert(docx_temp_path)
-        cv.close()
-        if not os.path.exists(docx_temp_path):
-            raise FileNotFoundError(f"Converted file not found: {docx_temp_path}")
-        background_tasks.add_task(remove_file, pdf_temp_path)
-        background_tasks.add_task(remove_file, docx_temp_path)
-        return FileResponse(
-            docx_temp_path,
-            media_type='application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-            filename=file.filename.replace('.pdf', '.docx')
-        )
     except Exception as e:
         remove_file(pdf_temp_path)
         remove_file(docx_temp_path)

+from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks, Response
 from fastapi.responses import FileResponse
+from pydantic import BaseModel
 from pdf2docx import Converter
 import os
 import shutil
+import pdfkit
+import uuid
 router = APIRouter()
 TEMP_DIR = "/.tempfiles"
+class HTMLRequest(BaseModel):
+    html_content: str
+def ensure_temp_dir():
+    os.makedirs(TEMP_DIR, exist_ok=True)
 def remove_file(path: str):
     if os.path.exists(path):
         os.unlink(path)
+def generate_temp_filepath(extension: str) -> str:
+    return os.path.join(TEMP_DIR, f"temp_{uuid.uuid4()}.{extension}")
+def html_to_pdf(html_content: str, output_path: str) -> None:
+    options = {
+        'page-size': 'A4',
+        'margin-top': '0.75in',
+        'margin-right': '0.75in',
+        'margin-bottom': '0.75in',
+        'margin-left': '0.75in',
+        'encoding': "UTF-8",
+    }
+    pdfkit.from_string(html_content, output_path, options=options)
+def pdf_to_docx(pdf_path: str, docx_path: str) -> None:
+    cv = Converter(pdf_path)
+    cv.convert(docx_path)
+    cv.close()
+def handle_conversion(convert_func, input_path: str, output_path: str, background_tasks: BackgroundTasks):
+    try:
+        convert_func(input_path, output_path)
+        if not os.path.exists(output_path):
+            raise FileNotFoundError(f"Converted file not found: {output_path}")
+        background_tasks.add_task(remove_file, input_path)
+        background_tasks.add_task(remove_file, output_path)
+        return FileResponse(
+            output_path,
+            media_type='application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+            filename=f"converted_document_{uuid.uuid4()}.docx"
+        )
+    except Exception as e:
+        remove_file(input_path)
+        remove_file(output_path)
+        raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}")
 @router.post("/convert/pdf_to_docx")
 async def convert_pdf_to_docx(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
     if not file.filename.endswith('.pdf'):
         raise HTTPException(status_code=400, detail="File must be a PDF")
+    ensure_temp_dir()
+    pdf_temp_path = generate_temp_filepath("pdf")
     docx_temp_path = pdf_temp_path.replace('.pdf', '.docx')
+    with open(pdf_temp_path, "wb") as pdf_file:
+        shutil.copyfileobj(file.file, pdf_file)
+    return handle_conversion(pdf_to_docx, pdf_temp_path, docx_temp_path, background_tasks)
+@router.post("/convert/html_to_pdf")
+async def convert_html_to_pdf(request: HTMLRequest):
+    ensure_temp_dir()
+    pdf_temp_path = generate_temp_filepath("pdf")
     try:
+        html_to_pdf(request.html_content, pdf_temp_path)
+        with open(pdf_temp_path, "rb") as pdf_file:
+            pdf_content = pdf_file.read()
+        remove_file(pdf_temp_path)
+        return Response(content=pdf_content, media_type="application/pdf")
+    except Exception as e:
+        remove_file(pdf_temp_path)
+        raise HTTPException(status_code=500, detail=str(e))
+@router.post("/convert/html_to_docx")
+async def convert_html_to_docx(background_tasks: BackgroundTasks, request: HTMLRequest):
+    ensure_temp_dir()
+    pdf_temp_path = generate_temp_filepath("pdf")
+    docx_temp_path = pdf_temp_path.replace('.pdf', '.docx')
+    try:
+        html_to_pdf(request.html_content, pdf_temp_path)
+        return handle_conversion(pdf_to_docx, pdf_temp_path, docx_temp_path, background_tasks)
     except Exception as e:
         remove_file(pdf_temp_path)
         remove_file(docx_temp_path)