import logging
from langchain_core.documents import Document
from typing import List
from langchain_core.output_parsers import BaseOutputParser
from qdrant_client import QdrantClient, models
import requests
# from langchain_cohere import CohereRerank
# def format_docs(docs: List[Document]) -> str:
# """Convert Documents to a single string."""
# formatted = [
# f"Article Content: {doc.metadata['Header 1']}"
# + (f" - {doc.metadata['Header 2']}" if 'Header 2' in doc.metadata and doc.metadata['Header 2'] else "")
# + (f" - {doc.metadata['Header 3']}" if 'Header 3' in doc.metadata and doc.metadata['Header 3'] else "")
# + f"\n{doc.page_content}"
# for doc in docs
# ]
# return "\n" + "\n".join(formatted)
def format_docs(docs):
    """Join the page contents of the documents into one blank-line-separated string."""
return "\n\n".join(doc.page_content for doc in docs)
class LineListOutputParser(BaseOutputParser[List[str]]):
"""Output parser for a list of lines."""
def parse(self, text: str) -> List[str]:
lines = text.strip().split("\n")
return list(filter(None, lines)) # Remove empty lines
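# Illustrative example (not invoked anywhere in this module): the parser splits a
# newline-separated LLM response into non-empty lines, e.g.
#   LineListOutputParser().parse("query one\n\nquery two\n") -> ["query one", "query two"]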
def extract_metadata(docs, headers=('Header_1', 'Header_2', 'Header_3')):
    """Collect the non-empty header values from each document's metadata, one list per document."""
meta_data_docs = []
for doc in docs:
meta_data_doc = [doc.metadata[header] for header in headers if doc.metadata.get(header)]
meta_data_docs.append(meta_data_doc)
return meta_data_docs
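# Illustrative example: for a document whose metadata is
#   {"Header_1": "Admissions", "Header_2": "Tuition", "Header_3": ""}
# extract_metadata([doc]) returns [["Admissions", "Tuition"]]; empty or missing headers are skipped.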
def search_with_filter(query, vector_store, k, headers):
    """Similarity search restricted to documents whose header metadata matches `headers`."""
    # Build one filter condition per provided header level (Header_1, Header_2, Header_3)
    header_keys = ("metadata.Header_1", "metadata.Header_2", "metadata.Header_3")
    conditions = [
        models.FieldCondition(
            key=key,
            match=models.MatchValue(value=value),
        )
        for key, value in zip(header_keys, headers)
    ]
    # Run the query with the assembled filter conditions
single_result = vector_store.similarity_search(
query=query,
k=k,
filter=models.Filter(
must=conditions
),
)
return single_result
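# Minimal usage sketch (illustrative only; assumes a LangChain Qdrant vector store
# named `vector_store` built elsewhere, and example header values that may not
# exist in the actual collection):
#   hits = search_with_filter(
#       query="tuition fees",
#       vector_store=vector_store,
#       k=5,
#       headers=["Admissions", "Tuition"],
#   )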
def get_relevant_documents(documents: List[Document], limit: int) -> List[Document]:
    """Deduplicate documents by page content, keeping at most `limit` unique documents."""
result = []
seen = set()
for doc in documents:
if doc.page_content not in seen:
result.append(doc)
seen.add(doc.page_content)
if len(result) == limit:
break
return result
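# Illustrative example: if several retrieved documents share the same page_content,
# get_relevant_documents(docs, limit=2) keeps only the first occurrence of each and
# stops as soon as two unique documents have been collected.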
def translate(text: str) -> str:
    """Translate Vietnamese text to English via Google Translate's public web endpoint."""
url = "https://translate.googleapis.com/translate_a/single"
params = {
"client": "gtx",
"sl": "vi",
"tl": "en",
"dt": "t",
"q": text,
}
try:
r = requests.get(
url, params=params, timeout=10
) # Add timeout for robustness
r.raise_for_status()
result = r.json()
translated_text = "".join([sentence[0] for sentence in result[0]])
return translated_text
except requests.exceptions.RequestException as e:
error_msg = f"Translation API error: {str(e)}"
logging.error(error_msg)
return f"{text}\n\n[Translation failed: {error_msg}]"
except Exception as e:
error_msg = f"Unexpected error during translation: {str(e)}"
logging.exception(error_msg) # Log traceback for unexpected errors
return f"{text}\n\n[Translation failed: {error_msg}]"