import streamlit as st
import pandas as pd
from transformers import BartForConditionalGeneration, TapexTokenizer, T5ForConditionalGeneration, T5Tokenizer
from prophet import Prophet
import datetime
import sentencepiece as spm
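
# Note: sentencepiece backs T5Tokenizer; the explicit import makes a missing
# dependency fail fast at startup rather than at first tokenization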

# Path to the CSS file; adjust to match your project structure
css_file = "style.css"

# Open and read the CSS file
with open(css_file, "r") as css:
    css_style = css.read()

# Markdown combining the Kanit font import, the custom CSS, and the header HTML
# (@import must precede other rules inside a <style> block, so it comes first)
html_content = f"""
<style>
@import url('https://fonts.googleapis.com/css2?family=Kanit:wght@700&display=swap');
{css_style}
</style>
<div style='display: flex; flex-direction: column; align-items: flex-start;'>
    <div style='display: flex; align-items: center;'>
        <div style='width: 20px; height: 40px; background-color: green; margin-right: 1px;'></div>
        <div style='width: 20px; height: 40px; background-color: red; margin-right: 1px;'></div>
        <div style='width: 20px; height: 40px; background-color: yellow; margin-right: 20px;'></div>
        <span style='font-size: 50px; font-weight: normal; font-family: "Kanit", sans-serif;'><strong>PROTAX</strong></span>
    </div>
    <div style='text-align: left; width: 100%;'>
        <span style='font-size: 20px; font-weight: normal; color: #333; font-family: "Kanit", sans-serif'>
            <strong>PRO</strong>phet & <strong>TA</strong>pex E<strong>X</strong>plorer</span>
    </div>
</div>
"""

# Render the combined markdown header in Streamlit
st.markdown(html_content, unsafe_allow_html=True)
# File upload interface
uploaded_file = st.file_uploader("Carregue um arquivo CSV ou XLSX", type=['csv', 'xlsx'])

if uploaded_file:
    if 'all_anomalies' not in st.session_state:
        with st.spinner('Aplicando modelo de série temporal...'):
            # Load the file into a DataFrame
            if uploaded_file.name.endswith('.csv'):
                df = pd.read_csv(uploaded_file, quotechar='"', encoding='utf-8')
            elif uploaded_file.name.endswith('.xlsx'):
                df = pd.read_excel(uploaded_file)
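            # Assumption inferred from the iloc offsets below: a pivot-style export
            # with group labels in column 10, 'Mon/Year' headers in row 2, and data
            # starting at row 3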
            # Data preprocessing for Prophet
            new_df = df.iloc[2:, 9:-1].fillna(0)
            new_df.columns = df.iloc[1, 9:-1]
            new_df.columns = new_df.columns.str.replace(r" \(\d+\)", "", regex=True)
            month_dict = {
                'Jan': '01', 'Fev': '02', 'Mar': '03', 'Abr': '04',
                'Mai': '05', 'Jun': '06', 'Jul': '07', 'Ago': '08',
                'Set': '09', 'Out': '10', 'Nov': '11', 'Dez': '12'
            }
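
            # Normalize Portuguese 'Mon/Year' headers (e.g. 'Fev/2023') to 'MM/YYYY'
            # so pd.to_datetime can parse them below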
            def convert_column_name(column_name):
                if column_name == 'Rótulos de Linha':
                    return column_name
                parts = column_name.split('/')
                month = parts[0].strip()
                year = parts[1].strip()
                year = ''.join(filter(str.isdigit, year))
                month_number = month_dict.get(month, '00')
                return f"{month_number}/{year}"
            new_df.columns = [convert_column_name(col) for col in new_df.columns]
            new_df.columns = pd.to_datetime(new_df.columns, errors='coerce')
            new_df.rename(columns={new_df.columns[0]: 'Rotulo'}, inplace=True)
            df_clean = new_df.copy()

            # Create an empty DataFrame to store all anomalies
            all_anomalies = pd.DataFrame()
            # Process each row in the DataFrame
            for index, row in df_clean.iterrows():
                data = pd.DataFrame({
                    'ds': [col for col in df_clean.columns if isinstance(col, pd.Timestamp)],
                    'y': row[[isinstance(col, pd.Timestamp) for col in df_clean.columns]].values
                })
                data = data[data['y'] > 0].reset_index(drop=True)

                if data.empty or len(data) < 2:
                    print(f"Skipping group {row['Rotulo']} because there are fewer than 2 non-zero observations.")
                    continue
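
                # Fit Prophet with a 95% prediction interval; fitting can still
                # raise ValueError for degenerate series, so those groups are skipped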
                try:
                    model = Prophet(interval_width=0.95)
                    model.fit(data)
                except ValueError as e:
                    print(f"Skipping group {row['Rotulo']} due to error: {e}")
                    continue
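
                # Forecast 12 monthly periods beyond the observed range, then align
                # the actual values against the forecast rows (None past the last actual)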
                future = model.make_future_dataframe(periods=12, freq='M')
                forecast = model.predict(future)
                num_real = len(data)
                num_forecast = len(forecast)
                real_values = list(data['y']) + [None] * (num_forecast - num_real)
                forecast['real'] = real_values
                anomalies = forecast[(forecast['real'] < forecast['yhat_lower']) | (forecast['real'] > forecast['yhat_upper'])].copy()
                anomalies['Group'] = row['Rotulo']
                all_anomalies = pd.concat([all_anomalies, anomalies[['ds', 'real', 'Group']]], ignore_index=True)
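
            # Keep only large anomalies: monetary value of at least 10,000,000.00
            # (currency is presumably BRL, given the Portuguese-language source data)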
            all_anomalies.rename(columns={"ds": "datetime", "real": "monetary value", "Group": "group"}, inplace=True)
            all_anomalies = all_anomalies[all_anomalies['monetary value'].astype('float') >= 10_000_000.00]
            # Sort numerically before formatting to strings, otherwise the sort
            # would be lexicographic and misorder values of different lengths
            all_anomalies.sort_values(by=['monetary value'], ascending=False, inplace=True)
            all_anomalies['monetary value'] = all_anomalies['monetary value'].apply(lambda x: f"{x:.2f}")
            all_anomalies = all_anomalies.fillna('').astype(str)
            # Store the result in session state so this block only runs once per upload
            st.session_state['all_anomalies'] = all_anomalies

    # Load the translation models (pt->en, en->pt) and the TAPEX table-QA model.
    # st.cache_resource keeps them in memory across Streamlit reruns instead of
    # reloading the weights on every user interaction
    @st.cache_resource
    def load_models():
        pt_en_translator = T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-pt-en-t5")
        en_pt_translator = T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-en-pt-t5")
        tokenizer = T5Tokenizer.from_pretrained("unicamp-dl/translation-pt-en-t5")
        tapex_model = BartForConditionalGeneration.from_pretrained("microsoft/tapex-large-finetuned-wtq")
        tapex_tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
        return pt_en_translator, en_pt_translator, tokenizer, tapex_model, tapex_tokenizer

    pt_en_translator, en_pt_translator, tokenizer, tapex_model, tapex_tokenizer = load_models()

    def translate(text, model, tokenizer, source_lang="pt", target_lang="en"):
        # The unicamp-dl models are T5-based; per their model cards they expect a
        # task prefix such as "translate Portuguese to English: " on the input
        prefix = "translate Portuguese to English: " if source_lang == "pt" else "translate English to Portuguese: "
        input_ids = tokenizer.encode(prefix + text, return_tensors="pt", add_special_tokens=True)
        outputs = model.generate(input_ids)
        translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return translated_text
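
    # Answer a Portuguese question about the anomalies table: translate pt -> en,
    # run TAPEX over the table, then translate the answer back en -> pt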
    def response(user_question, table_data):
        question_en = translate(user_question, pt_en_translator, tokenizer, source_lang="pt", target_lang="en")
        encoding = tapex_tokenizer(table=table_data, query=[question_en], padding=True, return_tensors="pt", truncation=True)
        outputs = tapex_model.generate(**encoding)
        response_en = tapex_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        response_pt = translate(response_en, en_pt_translator, tokenizer, source_lang="en", target_lang="pt")
        return response_pt

    # Streamlit interface
    st.dataframe(st.session_state['all_anomalies'].head())

    # Chat history
    if 'history' not in st.session_state:
        st.session_state['history'] = []

    user_question = st.text_input("Escreva sua questão aqui:", "")
    if user_question:
        st.session_state['history'].append(('👤', user_question))
        bot_response = response(user_question, st.session_state['all_anomalies'])
        st.session_state['history'].append(('🤖', bot_response))

    if st.button("Limpar"):
        st.session_state['history'] = []

    # Replay the chat history; the current exchange is already appended above,
    # so rendering only here avoids showing it twice
    for sender, message in st.session_state['history']:
        if sender == '👤':
            st.markdown(f"**👤 {message}**")
        elif sender == '🤖':
            st.markdown(f"<div style='text-align: right'><strong>🤖 {message}</strong></div>", unsafe_allow_html=True)
else:
    st.warning("Por favor, carregue um arquivo CSV ou XLSX para começar.")