import streamlit as st
import pandas as pd
import numpy as np
import re
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.chat_models import ChatOpenAI
from langchain.llms import HuggingFaceHub
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.prompts.prompt import PromptTemplate
import altair as alt
from session import set_partie_prenante
import os
from streamlit_vertical_slider import vertical_slider

load_dotenv()


def get_docs_from_website(urls):
    # Load the pages with a browser-like User-Agent so sites don't reject the request
    loader = WebBaseLoader(urls, header_template={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
    })
    docs = loader.load()
    return docs


def get_doc_chunks(docs):
    # Split the loaded data
    # text_splitter = RecursiveCharacterTextSplitter(
    #     chunk_size=500,
    #     chunk_overlap=100)
    text_splitter = SemanticChunker(OpenAIEmbeddings())
    docs = text_splitter.split_documents(docs)
    return docs


def disp_test():
    chart_data = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"])
    st.scatter_chart(chart_data)


def get_vectorstore_from_docs(doc_chunks):
    embedding = OpenAIEmbeddings(model="text-embedding-3-large")
    vectorstore = FAISS.from_documents(documents=doc_chunks, embedding=embedding)
    return vectorstore


def get_conversation_chain(vectorstore):
    llm = ChatOpenAI(model="gpt-4o", temperature=0.5, max_tokens=2048)
    retriever = vectorstore.as_retriever()
    prompt = hub.pull("rlm/rag-prompt")

    # Chain: retrieve context for the question, fill the RAG prompt, call the LLM
    rag_chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | llm
    )
    return rag_chain


# Fill the prompt used as the QUESTION variable of the RAG chain.
# Careful not to confuse it with the RAG prompt itself (pulled from the hub above).
def fill_promptQ_template(input_variables, template):
    prompt = PromptTemplate(input_variables=["BRAND_NAME", "BRAND_DESCRIPTION"], template=template)
    return prompt.format(BRAND_NAME=input_variables["BRAND_NAME"],
                         BRAND_DESCRIPTION=input_variables["BRAND_DESCRIPTION"])


template_extraction_PP = '''
Objectif : identifiez et proposez tous les noms de marques qui serviront comme partie prenante de la marque suivante pour développer un marketing de coopération (co-op marketing)

Le nom de la marque de référence est le suivant : {BRAND_NAME}
Son activité est la suivante : {BRAND_DESCRIPTION}

TA REPONSE DOIT ETRE SOUS FORME DE LISTE DE NOMS DE MARQUES
'''  # don't forget to fill the input variables from the main function


def text_to_list(text):
    # Each line is expected to look like "- <brand name> <score>"
    lines = text.replace("- ", "").split('\n')
    lines = [line.split() for line in lines if line.strip()]
    items = [[' '.join(line[:-1]), line[-1]] for line in lines]
    # Keep only the digits of the trailing score
    for item in items:
        item[1] = re.sub(r'\D', '', item[1])
    return items
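# Illustrative sketch only: text_to_list assumes the model returns one brand per
# line with a trailing numeric score. The sample text below is hypothetical, not
# output from the actual chain:
#
#   sample = "- Nike 85\n- Adidas 70"
#   text_to_list(sample)  # -> [['Nike', '85'], ['Adidas', '70']]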
def extract_pp(urls, input_variables):
    docs = get_docs_from_website(urls)

    # get text chunks
    text_chunks = get_doc_chunks(docs)

    # create vectorstore
    vectorstore = get_vectorstore_from_docs(text_chunks)

    chain = get_conversation_chain(vectorstore)

    # uses the module-level template_extraction_PP defined above
    question = fill_promptQ_template(input_variables, template_extraction_PP)

    response = chain.invoke(question)

    # More advanced version to consider:
    # each item in the list is a list with the name of the brand and the similarity percentage
    # partie_prenante = text_to_list(response.content)

    # simple version
    partie_prenante = response.content.replace("- ", "").split('\n')

    return partie_prenante


def disp_vertical_slider(partie_prenante):
    number_of_sliders = len(partie_prenante)

    # Note: st.set_page_config must be the first Streamlit call of the script,
    # so this function can only run before anything else is rendered
    st.set_page_config(layout="wide")

    st.subheader("Vertical Slider")
    st.title("Vertical Slider")
    st.write("This is a vertical slider example")

    bar = st.columns(number_of_sliders)
    for i in range(number_of_sliders):
        with bar[i]:
            tst = vertical_slider(
                label=partie_prenante[i],
                height=100,
                key=partie_prenante[i],
                default_value=50,
                thumb_color="orange",  # Optional - Defaults to Streamlit Red
                step=1,
                min_value=0,
                max_value=100,
                value_always_visible=False,
            )
            st.write(tst)


def display_pp():
    load_dotenv()

    st.header("INDIQUEZ VOS PAGES WEB ET/OU DOCUMENTS D’ENTREPRISE POUR AUDITER LE CONTENU RSE")
    loaded = False

    option = st.radio("Source", ("A partir de votre site web", "A partir de vos documents entreprise"))

    if option == "A partir de votre site web":
        url1 = st.text_input("URL 1")
        brand_name = st.text_input("Nom de la marque")
        brand_description = st.text_area("Description de la marque")

        if st.button("Process") and not loaded:
            loaded = True
            with st.spinner("Processing..."):
                input_variables = {"BRAND_NAME": brand_name, "BRAND_DESCRIPTION": brand_description}
                partie_prenante = extract_pp([url1], input_variables)
                partie_prenante = sorted(partie_prenante)
                set_partie_prenante(partie_prenante)
                st.write(pd.DataFrame(partie_prenante, columns=["Partie prenante"]))

                # alphabet = [pp[0] for pp in partie_prenante]
                # pouvoir = [50 for _ in range(len(partie_prenante))]
                # df = pd.DataFrame({'partie_prenante': partie_prenante, 'pouvoir': pouvoir, 'code couleur': partie_prenante})
                # st.write(df)
                # c = (
                #     alt.Chart(df)
                #     .mark_circle(size=300)
                #     .encode(x="partie_prenante", y=alt.Y("pouvoir", scale=alt.Scale(domain=[0, 100])), color="code couleur")
                # )

                # st.subheader("Vertical Slider")
                # age = st.slider("How old are you?", 0, 130, 25)
                # st.write("I'm ", age, "years old")
                # disp_vertical_slider(partie_prenante)
                # st.altair_chart(c, use_container_width=True)
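
# Minimal standalone usage sketch (assumptions: OPENAI_API_KEY is set in .env,
# and the URL and brand details below are hypothetical placeholders, not values
# from the original app):
if __name__ == "__main__":
    sample_variables = {
        "BRAND_NAME": "ExampleBrand",
        "BRAND_DESCRIPTION": "A sportswear retailer focused on sustainable products.",
    }
    # Runs the full pipeline: scrape -> chunk -> embed -> RAG extraction
    stakeholders = extract_pp(["https://example.com"], sample_variables)
    print(stakeholders)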