nijoow committed on
Commit
9b75908
·
1 Parent(s): 6934747

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -31
app.py CHANGED
@@ -27,41 +27,50 @@ def get_pdf_text(pdf_docs):
27
 
28
  # 과제
29
  # 아래 텍스트 추출 함수를 작성
30
- def get_text_file(docs):
31
- text_content = ""
32
-
33
- # Hugging Face 모델을 로드합니다. (예시로 'bert-base-uncased' 모델을 사용합니다)
34
- model_name = 'bert-base-uncased'
35
- model = HuggingFaceEmbeddings(model_name)
36
-
37
- # 텍스트 추출을 위해 문서를 로드합니다.
38
- text_loader = TextLoader(docs.getvalue().decode('utf-8'))
39
- document = text_loader.load()
40
-
41
  import csv
42
  import json
43
- from tempfile import NamedTemporaryFile
44
-
45
- def get_csv_file(docs):
46
- with NamedTemporaryFile(delete=False) as temp_file:
47
- temp_file.write(docs.getvalue().encode('utf-8'))
48
- temp_file.seek(0)
49
-
50
- csv_data = []
51
- csv_reader = csv.reader(temp_file)
52
- for row in csv_reader:
53
- csv_data.append(row)
54
-
55
- return csv_data
56
-
57
- def get_json_file(docs):
58
- with NamedTemporaryFile(delete=False) as temp_file:
59
- temp_file.write(docs.getvalue().encode('utf-8'))
60
- temp_file.seek(0)
61
 
62
- json_data = json.load(temp_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
- return json_data
 
 
 
 
 
 
 
 
65
 
66
 
67
 
 
27
 
28
  # 과제
29
  # 아래 텍스트 추출 함수를 작성
 
 
 
 
 
 
 
 
 
 
 
30
  import csv
31
  import json
32
+ from PyPDF2 import PdfReader
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
def get_text_from_document(doc_path):
    """Return the text content of a document, chosen by its file extension.

    Supported extensions, matched case-insensitively: ``.txt``, ``.csv``,
    ``.json`` and ``.pdf``.

    Args:
        doc_path: Path of the document to read (a string or path-like
            object).

    Returns:
        The extracted text as a single string. CSV rows are rendered as
        comma-joined lines, JSON is re-serialised with two-space
        indentation, and PDF page texts are concatenated in page order.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    # Normalise once so that "REPORT.PDF" is handled like "report.pdf".
    lowered = str(doc_path).lower()

    if lowered.endswith('.txt'):
        # Plain text file.
        with open(doc_path, 'r', encoding='utf-8') as file:
            return file.read()

    if lowered.endswith('.csv'):
        # CSV file. newline='' leaves newline handling to the csv module,
        # so line breaks inside quoted fields are not altered.
        with open(doc_path, 'r', encoding='utf-8', newline='') as file:
            return '\n'.join(','.join(row) for row in csv.reader(file))

    if lowered.endswith('.json'):
        # JSON file.
        with open(doc_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        # ensure_ascii=False keeps non-ASCII text readable instead of
        # turning it into \uXXXX escapes.
        return json.dumps(data, indent=2, ensure_ascii=False)

    if lowered.endswith('.pdf'):
        # PDF file. Pages must be read while the file is still open.
        with open(doc_path, 'rb') as file:
            pdf_reader = PdfReader(file)
            # Guard against a page yielding no text (a falsy result) so the
            # join never receives a non-string value.
            return ''.join(
                page.extract_text() or '' for page in pdf_reader.pages
            )

    # Unsupported extension.
    raise ValueError(f"Unsupported file extension: {doc_path}")
64
 
65
def get_text_file(docs):
    """Extract text from each document and return the collected texts.

    Each document is passed to ``get_text_from_document``. The extracted
    text is printed, as before, and is now also collected and returned
    instead of being discarded.

    Args:
        docs: An iterable of document paths, or a single path given as a
            string.

    Returns:
        A list holding the text of every document that was processed, in
        input order. Documents whose processing raises ``ValueError`` (for
        example an unsupported extension) are reported and skipped.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(docs, str):
        docs = [docs]

    texts = []
    # Process each document in turn.
    for doc in docs:
        try:
            text = get_text_from_document(doc)
        except ValueError as e:
            print(f"Error processing {doc}: {e}")
            continue
        # Additional preprocessing that uses `text` could be done here.
        print(f"Text from {doc}:\n{text}\n")
        texts.append(text)
    return texts
74
 
75
 
76