Samuel-DD07 committed on
Commit
3104437
·
1 Parent(s): e0724f2

Ajouter la prise en charge des fichiers PDF et PyPDF2

Browse files
Files changed (5) hide show
  1. .gitignore +1 -0
  2. app.py +52 -57
  3. modeles.py +2 -51
  4. requirements.txt +2 -1
  5. uploadFile.py +11 -13
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
app.py CHANGED
@@ -4,16 +4,21 @@ from fastapi import FastAPI, UploadFile
4
  from typing import Union
5
  import json
6
  import csv
7
- from modeles import bert, squeezebert, deberta, loadSqueeze
8
  from uploadFile import file_to_text
9
  from typing import List
10
  from transformers import pipeline
11
  from pydantic import BaseModel
12
-
13
-
14
 
15
  app = FastAPI()
16
 
 
 
 
 
 
 
17
  app.add_middleware(
18
  CORSMiddleware,
19
  allow_origins=["*"],
@@ -22,53 +27,58 @@ app.add_middleware(
22
  allow_headers=["*"],
23
  )
24
 
25
- class SqueezeBERTRequest(BaseModel):
26
- context: str
27
- question: str
28
-
29
- class BERTRequest(BaseModel):
30
- context: str
31
- question: str
32
-
33
- class DeBERTaRequest(BaseModel):
34
- context: str
35
- question: str
36
-
37
- pipBert = pipeline('question-answering', model="ALOQAS/bert-large-uncased-finetuned-squad-v2", tokenizer="ALOQAS/bert-large-uncased-finetuned-squad-v2")
38
- pipDeberta = pipeline('question-answering', model="ALOQAS/deberta-large-finetuned-squad-v2", tokenizer="ALOQAS/deberta-large-finetuned-squad-v2")
39
- tokenizer, model = loadSqueeze()
40
 
41
  @app.get("/")
42
  async def root():
43
  return {"message": "Hello World"}
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  @app.post("/uploadfile/")
47
- async def create_upload_file(files: List[UploadFile], question: str, model: str):
48
- res = []
49
  for file in files:
50
- fileToText = await file_to_text(file)
51
-
52
- res.append({"model": model, "texte": question, "filename": file.filename, "file_to_text": fileToText})
53
-
54
- return res
55
-
56
-
57
- @app.post("/contextText/")
58
- async def create_upload_file(context: str, texte: str, model: str):
59
-
60
- return {"model": model, "texte": texte, "context": context}
61
-
62
-
63
- @app.post("/withoutFile/")
64
- async def create_upload_file(texte: str, model: str):
65
-
66
- return {"model": model, "texte": texte}
 
 
 
67
 
68
  @app.post("/squeezebert/")
69
- async def qasqueezebert(request: SqueezeBERTRequest):
70
  try:
71
- squeezebert_answer = squeezebert(request.context, request.question, model, tokenizer)
72
  if squeezebert_answer:
73
  return squeezebert_answer
74
  else:
@@ -77,7 +87,7 @@ async def qasqueezebert(request: SqueezeBERTRequest):
77
  raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
78
 
79
  @app.post("/bert/")
80
- async def qabert(request: BERTRequest):
81
  try:
82
  bert_answer = bert(request.context, request.question, pipBert)
83
  if bert_answer:
@@ -87,8 +97,8 @@ async def qabert(request: BERTRequest):
87
  except Exception as e:
88
  raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
89
 
90
- @app.post("/deberta-v2/")
91
- async def qadeberta(request: DeBERTaRequest):
92
  try:
93
  deberta_answer = deberta(request.context, request.question, pipDeberta)
94
  if deberta_answer:
@@ -97,18 +107,3 @@ async def qadeberta(request: DeBERTaRequest):
97
  raise HTTPException(status_code=404, detail="No answer found")
98
  except Exception as e:
99
  raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
100
-
101
- def extract_data(file: UploadFile) -> Union[str, dict, list]:
102
- if file.filename.endswith(".txt"):
103
- data = file.file.read()
104
- return data.decode("utf-8")
105
- elif file.filename.endswith(".csv"):
106
- data = file.file.read().decode("utf-8")
107
- rows = data.split("\n")
108
- reader = csv.DictReader(rows)
109
- return [dict(row) for row in reader]
110
- elif file.filename.endswith(".json"):
111
- data = file.file.read().decode("utf-8")
112
- return json.loads(data)
113
- else:
114
- return "Invalid file format"
 
4
  from typing import Union
5
  import json
6
  import csv
7
+ from modeles import bert, squeezebert, deberta
8
  from uploadFile import file_to_text
9
  from typing import List
10
  from transformers import pipeline
11
  from pydantic import BaseModel
12
+ from typing import Optional
 
13
 
14
  app = FastAPI()
15
 
16
+ class Request(BaseModel):
17
+ context: str
18
+ question: str
19
+ model: Optional[str] = None
20
+ # files: Optional[List[UploadFile]] = None
21
+
22
  app.add_middleware(
23
  CORSMiddleware,
24
  allow_origins=["*"],
 
27
  allow_headers=["*"],
28
  )
29
 
30
+ pipSqueezeBert = pipeline("question-answering", model="ALOQAS/squeezebert-uncased-finetuned-squad-v2")
31
+ pipBert = pipeline('question-answering', model="ALOQAS/bert-large-uncased-finetuned-squad-v2")
32
+ pipDeberta = pipeline('question-answering', model="ALOQAS/deberta-large-finetuned-squad-v2")
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  @app.get("/")
35
  async def root():
36
  return {"message": "Hello World"}
37
 
38
+ @app.post("/contextText/")
39
+ async def create_upload_file(request: Request):
40
+ try:
41
+ if request.model == "squeezebert":
42
+ answer = squeezebert(request.context, request.question, pipSqueezeBert)
43
+ elif request.model == "bert":
44
+ answer = bert(request.context, request.question, pipBert)
45
+ elif request.model == "deberta":
46
+ answer = deberta(request.context, request.question, pipDeberta)
47
+ else:
48
+ raise HTTPException(status_code=400, detail="Model not found.")
49
+ return answer
50
+ except Exception as e:
51
+ raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
52
 
53
  @app.post("/uploadfile/")
54
+ async def create_upload_file(files: List[UploadFile] = File(...), question: str = Form(...), model: str = Form(...)):
55
+ res = ""
56
  for file in files:
57
+ try:
58
+ res += await file_to_text(file)
59
+ except Exception as e:
60
+ print(f"Failed to process file {file.filename}: {e}")
61
+ continue
62
+
63
+ if res == "":
64
+ raise HTTPException(status_code=400, detail="All files failed to process.")
65
+
66
+ answer = None
67
+ if model == "squeezebert":
68
+ answer = squeezebert(res, question, pipSqueezeBert)
69
+ elif model == "bert":
70
+ answer = bert(res, question, pipBert)
71
+ elif model == "deberta":
72
+ answer = deberta(res, question, pipDeberta)
73
+ else:
74
+ raise HTTPException(status_code=400, detail="Model not found.")
75
+
76
+ return answer
77
 
78
  @app.post("/squeezebert/")
79
+ async def qasqueezebert(request: Request):
80
  try:
81
+ squeezebert_answer = squeezebert(request.context, request.question, pipSqueezeBert)
82
  if squeezebert_answer:
83
  return squeezebert_answer
84
  else:
 
87
  raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
88
 
89
  @app.post("/bert/")
90
+ async def qabert(request: Request):
91
  try:
92
  bert_answer = bert(request.context, request.question, pipBert)
93
  if bert_answer:
 
97
  except Exception as e:
98
  raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
99
 
100
+ @app.post("/deberta/")
101
+ async def qadeberta(request: Request):
102
  try:
103
  deberta_answer = deberta(request.context, request.question, pipDeberta)
104
  if deberta_answer:
 
107
  raise HTTPException(status_code=404, detail="No answer found")
108
  except Exception as e:
109
  raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
modeles.py CHANGED
@@ -1,54 +1,5 @@
1
- from transformers import AutoTokenizer, AutoModelForQuestionAnswering
2
- import torch
3
-
4
- def loadSqueeze():
5
- tokenizer = AutoTokenizer.from_pretrained("ALOQAS/squeezebert-uncased-finetuned-squad-v2")
6
- model = AutoModelForQuestionAnswering.from_pretrained("ALOQAS/squeezebert-uncased-finetuned-squad-v2")
7
- return tokenizer, model
8
-
9
- def squeezebert(context, question, model, tokenizer):
10
- # Tokenize the input question-context pair
11
- inputs = tokenizer.encode_plus(question, context, max_length=512, truncation=True, padding=True, return_tensors='pt')
12
-
13
- # Send inputs to the same device as your model
14
- inputs = {k: v.to(model.device) for k, v in inputs.items()}
15
-
16
- with torch.no_grad():
17
- # Forward pass, get model outputs
18
- outputs = model(**inputs)
19
-
20
- # Extract the start and end positions of the answer in the tokens
21
- answer_start_scores, answer_end_scores = outputs.start_logits, outputs.end_logits
22
-
23
- # Calculate probabilities from logits
24
- answer_start_prob = torch.softmax(answer_start_scores, dim=-1)
25
- answer_end_prob = torch.softmax(answer_end_scores, dim=-1)
26
-
27
- # Find the most likely start and end positions
28
- answer_start_index = torch.argmax(answer_start_prob) # Most likely start of answer
29
- answer_end_index = torch.argmax(answer_end_prob) + 1 # Most likely end of answer; +1 for inclusive slicing
30
-
31
- # Extract the highest probability scores
32
- start_score = answer_start_prob.max().item() # Highest probability of start
33
- end_score = answer_end_prob.max().item() # Highest probability of end
34
-
35
- # Combine the scores into a singular score
36
- combined_score = (start_score * end_score) ** 0.5 # Geometric mean of start and end scores
37
-
38
- # Convert token indices to the actual answer text
39
- answer_tokens = inputs['input_ids'][0, answer_start_index:answer_end_index]
40
- answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
41
-
42
- # Return the answer, its positions, and the combined score
43
- return {
44
- "answer": answer,
45
- "start": answer_start_index.item(),
46
- "end": answer_end_index.item(),
47
- "score": combined_score
48
- }
49
-
50
-
51
-
52
 
53
  def bert(context, question, pip):
54
  return pip(context=context, question=question)
 
1
+ def squeezebert(context, question, pip):
2
+ return pip(context=context, question=question)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  def bert(context, question, pip):
5
  return pip(context=context, question=question)
requirements.txt CHANGED
@@ -13,4 +13,5 @@ chardet
13
  frontend
14
  typing
15
  torch
16
- pydantic
 
 
13
  frontend
14
  typing
15
  torch
16
+ pydantic
17
+ PyPDF2
uploadFile.py CHANGED
@@ -2,11 +2,11 @@ import chardet
2
  from fastapi import UploadFile, HTTPException
3
  from io import BytesIO
4
  from docx import Document
5
- import fitz
6
 
7
  async def file_to_text(file: UploadFile):
8
  file_extension = file.filename.split('.')[-1].lower()
9
-
10
  if file_extension == 'csv':
11
  csv_data = await file.read()
12
  encoding = chardet.detect(csv_data)['encoding']
@@ -16,12 +16,12 @@ async def file_to_text(file: UploadFile):
16
  except UnicodeDecodeError:
17
  raise HTTPException(status_code=400, detail="Le fichier CSV contient des caractères qui ne peuvent pas être décodés.")
18
 
19
- # Fait
20
  elif file_extension == 'json':
21
  json_data = await file.read()
22
  return json_data.decode()
23
 
24
- # Fait
25
  elif file_extension == 'docx':
26
  doc_data = await file.read()
27
  # Utilisez un flux mémoire pour passer les données au Document
@@ -29,23 +29,21 @@ async def file_to_text(file: UploadFile):
29
  doc = Document(doc_stream)
30
  doc_text = [paragraph.text for paragraph in doc.paragraphs]
31
  return '\n'.join(doc_text)
32
-
33
- # Fait
34
  elif file_extension == 'txt':
35
  txt_data = await file.read()
36
  return txt_data.decode()
37
-
38
  # Fait
39
  elif file_extension == 'pdf':
40
  try:
41
  pdf_data = await file.read()
42
  # Chargez les données binaires dans un objet fitz.Document
43
- pdf_document = fitz.open("pdf", pdf_data)
44
- text = ''
45
- # Extrait le texte de chaque page
46
- for page in pdf_document:
47
- text += page.get_text()
48
- pdf_document.close()
49
  return text
50
  except Exception as e:
51
  raise HTTPException(status_code=500, detail=f"Erreur de lecture du fichier PDF : {e}")
 
2
  from fastapi import UploadFile, HTTPException
3
  from io import BytesIO
4
  from docx import Document
5
+ import PyPDF2
6
 
7
  async def file_to_text(file: UploadFile):
8
  file_extension = file.filename.split('.')[-1].lower()
9
+ # Fait
10
  if file_extension == 'csv':
11
  csv_data = await file.read()
12
  encoding = chardet.detect(csv_data)['encoding']
 
16
  except UnicodeDecodeError:
17
  raise HTTPException(status_code=400, detail="Le fichier CSV contient des caractères qui ne peuvent pas être décodés.")
18
 
19
+ # Fait
20
  elif file_extension == 'json':
21
  json_data = await file.read()
22
  return json_data.decode()
23
 
24
+ # Fait
25
  elif file_extension == 'docx':
26
  doc_data = await file.read()
27
  # Utilisez un flux mémoire pour passer les données au Document
 
29
  doc = Document(doc_stream)
30
  doc_text = [paragraph.text for paragraph in doc.paragraphs]
31
  return '\n'.join(doc_text)
32
+
33
+ # Fait
34
  elif file_extension == 'txt':
35
  txt_data = await file.read()
36
  return txt_data.decode()
37
+
38
  # Fait
39
  elif file_extension == 'pdf':
40
  try:
41
  pdf_data = await file.read()
42
  # Chargez les données binaires dans un objet PyPDF2.PdfReader
43
+ pdf_document = PyPDF2.PdfReader(BytesIO(pdf_data))
44
+ text = ""
45
+ for page_number in range(len(pdf_document.pages)):
46
+ text += pdf_document.pages[page_number].extract_text()
 
 
47
  return text
48
  except Exception as e:
49
  raise HTTPException(status_code=500, detail=f"Erreur de lecture du fichier PDF : {e}")