File size: 3,793 Bytes
cbc1d23 8ba144e cbc1d23 7589954 8ba144e 9fd45d8 8ba144e cbc1d23 8ba144e bff547d 8ba144e cbc1d23 8ba144e bff547d 8ba144e 7589954 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
import run
import util
import docx
from docx.oxml.ns import qn
from docx.shared import Pt,RGBColor
import fitz
import os
from fpdf import FPDF
import run
from BERT_inference import BertClassificationModel
def text_dump_to_lines(text,topic_num,max_length):
    """Summarise raw text and export the result as txt, docx and pdf files.

    The text is segmented with ``util.seg``, cleaned with ``run.texClear``
    and handed to ``run.textToAb`` together with the segmented lines,
    ``topic_num`` and ``max_length`` (both coerced with ``int``).

    Returns a 5-tuple: the keys joined by newlines, the output joined by
    newlines, and the paths returned by ``dump_to_txt``, ``dump_to_docx``
    and ``dump_to_pdf`` for the output.
    """
    segmented = util.seg(text)
    cleaned = run.texClear(segmented)
    print(cleaned)

    topic_count = int(topic_num)
    length_limit = int(max_length)
    keys, output = run.textToAb(cleaned, segmented, topic_count, length_limit)

    joined_keys = "\n".join(keys)
    joined_output = "\n".join(output)
    print(keys, output)

    # Export in the same order as before: txt, then docx, then pdf.
    txt_path = dump_to_txt(output)
    docx_path = dump_to_docx(output)
    pdf_path = dump_to_pdf(output)
    return joined_keys, joined_output, txt_path, docx_path, pdf_path
def file_dump_to_lines(file,topic_num,max_length):
    """Read a .txt, .docx or .pdf file and summarise its text.

    Args:
        file: Object whose ``name`` attribute is the path of the file to read.
        topic_num: Forwarded unchanged to ``text_dump_to_lines``.
        max_length: Forwarded unchanged to ``text_dump_to_lines``.

    Returns:
        Whatever ``text_dump_to_lines`` returns for the extracted text
        (keys text, output text, txt path, docx path, pdf path).

    The extension is matched case-insensitively. A file with any other
    extension yields no lines, so an empty string is passed on.
    """
    lines = []
    # Lower-case the extension so "REPORT.PDF" is treated like "report.pdf".
    file_format = file.name.rsplit(".", 1)[-1].lower()

    if file_format == "txt":
        with open(file.name, encoding='utf-8') as f:
            content = f.read()
        lines = [x.strip() for x in content.split("\n") if x.strip()]
    elif file_format == "docx":
        # Paragraph texts are taken as-is (not stripped, blanks kept).
        lines = [par.text for par in docx.Document(file.name).paragraphs]
    elif file_format == "pdf":
        pdf = fitz.open(file.name)
        try:
            for page in pdf:
                page_text = page.get_text("text")
                lines.extend(
                    x.strip() for x in page_text.split("\n") if x.strip()
                )
        finally:
            # Release the document handle even if text extraction fails.
            pdf.close()

    text = "\n".join(lines)
    print(text)
    return text_dump_to_lines(text, topic_num, max_length)
def dump_to_txt(lines, path='temp.txt'):
    """Write *lines* to a UTF-8 text file and return its absolute path.

    Args:
        lines: Iterable of strings; they are joined with ``"\\n"`` (no
            trailing newline is added).
        path: Destination file. Defaults to ``temp.txt`` in the current
            working directory, which is the location used when the
            argument is omitted.

    Returns:
        The absolute path of the written file.
    """
    with open(path, mode="w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    return os.path.abspath(path)
def dump_to_docx(lines, path=r'temp.docx'):
    """Write *lines* to a Word document, one paragraph each, and return its path.

    Args:
        lines: Iterable of strings; each becomes its own paragraph.
        path: Destination file. Defaults to ``temp.docx`` in the current
            working directory, which is the location used when the
            argument is omitted.

    Returns:
        The absolute path of the saved document.
    """
    document = docx.Document()

    # Default style: 14pt black text, with the same font name registered
    # for East Asian text so it is used for CJK characters as well.
    normal = document.styles['Normal']
    normal.font.name = u'宋体'
    normal._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
    normal.font.size = Pt(14)
    normal.font.color.rgb = RGBColor(0,0,0)

    # The document starts with a paragraph holding one empty run styled in
    # Cambria. It is kept so generated files keep their existing layout.
    # The local is deliberately not called `run`, which would shadow the
    # imported `run` module.
    lead_run = document.add_paragraph().add_run()
    lead_run.font.name = u'Cambria'
    lead_run.font.color.rgb = RGBColor(0,0,0)
    lead_run._element.rPr.rFonts.set(qn('w:eastAsia'), u'Cambria')

    for line in lines:
        document.add_paragraph(line)

    document.save(path)
    return os.path.abspath(path)
def dump_to_pdf(lines, path="temp.pdf", max_chars=45):
    """Write *lines* to a PDF file and return its absolute path.

    Each input line is cut into consecutive pieces of at most ``max_chars``
    characters, and every piece is written as its own PDF line. An empty
    input line produces no output line.

    Args:
        lines: Iterable of strings to write.
        path: Destination file. Defaults to ``temp.pdf`` in the current
            working directory, which is the location used when the
            argument is omitted.
        max_chars: Maximum number of characters per PDF line (default 45).

    Returns:
        The absolute path of the written file.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    pdf = FPDF()
    # Load the font file.
    # NOTE(review): 'FZY3JW.TTF' is a relative path and the fourth positional
    # argument is presumably the unicode flag -- confirm against the
    # installed fpdf version.
    pdf.add_font('FZY3JW', '', 'FZY3JW.TTF', True)
    pdf.add_page()
    # Set the PDF font size.
    pdf.set_font("FZY3JW", size=12)

    try:
        for line in lines:
            # Slicing clamps at the end of the string, so the last piece
            # needs no special case.
            for start in range(0, len(line), max_chars):
                pdf.cell(0, 5, line[start:start + max_chars], ln=1)
    except Exception as e:
        # Best effort: report the problem and still save what was written.
        print(e)

    pdf.output(path)
    return os.path.abspath(path)
if __name__ == "__main__":
    # Manual smoke test: summarise test.txt with topic_num=10, max_length=50.
    with open('test.txt', 'r', encoding='utf-8') as f:
        data = f.read()
    text_dump_to_lines(data, 10, 50)