Spaces:
Running
Running
File size: 7,627 Bytes
e2af017 96bff79 e2af017 96bff79 63c894a 535c2d9 63c894a 535c2d9 a16cb6b f2de8aa 63c894a a16cb6b 535c2d9 f2de8aa 96bff79 e2af017 02b222c e2af017 2b9a5fa fa81cfc 2b9a5fa 49f17d6 2b9a5fa e2af017 2b9a5fa e2af017 2b9a5fa e2af017 02b222c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
import streamlit as st
import pandas as pd
from transformers import BartForConditionalGeneration, TapexTokenizer, T5ForConditionalGeneration, T5Tokenizer
from prophet import Prophet
import datetime
import sentencepiece as spm
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Kanit:wght@700&display=swap');
</style>
<div style='display: flex; flex-direction: column; align-items: flex-start;'>
<div style='display: flex; align-items: center;'>
<div style='width: 20px; height: 40px; background-color: green; margin-right: 1px;'></div>
<div style='width: 20px; height: 40px; background-color: red; margin-right: 1px;'></div>
<div style='width: 20px; height: 40px; background-color: yellow; margin-right: 20px;'></div>
<span style='font-size: 40px; font-weight: normal; font-family: "Kanit", sans-serif;'>PROTAX</span>
</div>
<div style='text-align: left; width: 100%;'>
<span style='font-size: 20px; font-weight: normal; color: #333; font-family: "Kanit", sans-serif'>
<strong>PRO</strong>phet & <strong>TA</strong>pex E<strong>X</strong>plorer</span>
</div>
</div>
""", unsafe_allow_html=True)
# File upload interface
uploaded_file = st.file_uploader("Carregue um arquivo CSV ou XLSX", type=['csv', 'xlsx'])
if uploaded_file:
if 'all_anomalies' not in st.session_state:
with st.spinner('Aplicando modelo de série temporal...'):
# Load the file into a DataFrame
if uploaded_file.name.endswith('.csv'):
df = pd.read_csv(uploaded_file, quotechar='"', encoding='utf-8')
elif uploaded_file.name.endswith('.xlsx'):
df = pd.read_excel(uploaded_file)
# Data preprocessing for Prophet
new_df = df.iloc[2:, 9:-1].fillna(0)
new_df.columns = df.iloc[1, 9:-1]
new_df.columns = new_df.columns.str.replace(r" \(\d+\)", "", regex=True)
month_dict = {
'Jan': '01', 'Fev': '02', 'Mar': '03', 'Abr': '04',
'Mai': '05', 'Jun': '06', 'Jul': '07', 'Ago': '08',
'Set': '09', 'Out': '10', 'Nov': '11', 'Dez': '12'
}
def convert_column_name(column_name):
if column_name == 'Rótulos de Linha':
return column_name
parts = column_name.split('/')
month = parts[0].strip()
year = parts[1].strip()
year = ''.join(filter(str.isdigit, year))
month_number = month_dict.get(month, '00')
return f"{month_number}/{year}"
new_df.columns = [convert_column_name(col) for col in new_df.columns]
new_df.columns = pd.to_datetime(new_df.columns, errors='coerce')
new_df.rename(columns={new_df.columns[0]: 'Rotulo'}, inplace=True)
df_clean = new_df.copy()
# Create an empty DataFrame to store all anomalies
all_anomalies = pd.DataFrame()
# Process each row in the DataFrame
for index, row in df_clean.iterrows():
data = pd.DataFrame({
'ds': [col for col in df_clean.columns if isinstance(col, pd.Timestamp)],
'y': row[[isinstance(col, pd.Timestamp) for col in df_clean.columns]].values
})
data = data[data['y'] > 0].reset_index(drop=True)
if data.empty or len(data) < 2:
print(f"Skipping group {row['Rotulo']} because there are less than 2 non-zero observations.")
continue
try:
model = Prophet(interval_width=0.95)
model.fit(data)
except ValueError as e:
print(f"Skipping group {row['Rotulo']} due to error: {e}")
continue
future = model.make_future_dataframe(periods=12, freq='M')
forecast = model.predict(future)
num_real = len(data)
num_forecast = len(forecast)
real_values = list(data['y']) + [None] * (num_forecast - num_real)
forecast['real'] = real_values
anomalies = forecast[(forecast['real'] < forecast['yhat_lower']) | (forecast['real'] > forecast['yhat_upper'])]
anomalies['Group'] = row['Rotulo']
all_anomalies = pd.concat([all_anomalies, anomalies[['ds', 'real', 'Group']]], ignore_index=True)
# Store the result in session state
all_anomalies.rename(columns={"ds": "datetime", "real": "monetary value", "Group": "group"}, inplace=True)
all_anomalies = all_anomalies[all_anomalies['monetary value'].astype('float') >= 10,000,000.00]
all_anomalies['monetary value'] = all_anomalies['monetary value'].apply(lambda x: f"{x:.2f}")
all_anomalies.sort_values(by=['monetary value'], ascending=False, inplace=True)
all_anomalies = all_anomalies.fillna('').astype(str)
st.session_state['all_anomalies'] = all_anomalies
# Load translation models
pt_en_translator = T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-pt-en-t5")
en_pt_translator = T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-en-pt-t5")
tokenizer = T5Tokenizer.from_pretrained("unicamp-dl/translation-pt-en-t5")
# Load TAPEX model
tapex_model = BartForConditionalGeneration.from_pretrained("microsoft/tapex-large-finetuned-wtq")
tapex_tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
def translate(text, model, tokenizer, source_lang="pt", target_lang="en"):
input_ids = tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)
outputs = model.generate(input_ids)
translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
return translated_text
def response(user_question, table_data):
question_en = translate(user_question, pt_en_translator, tokenizer, source_lang="pt", target_lang="en")
encoding = tapex_tokenizer(table=table_data, query=[question_en], padding=True, return_tensors="pt", truncation=True)
outputs = tapex_model.generate(**encoding)
response_en = tapex_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
response_pt = translate(response_en, en_pt_translator, tokenizer, source_lang="en", target_lang="pt")
return response_pt
# Streamlit interface
st.dataframe(st.session_state['all_anomalies'].head())
# Chat history
if 'history' not in st.session_state:
st.session_state['history'] = []
user_question = st.text_input("Escreva sua questão aqui:", "")
if user_question:
st.session_state['history'].append(('👤', user_question))
st.markdown(f"**👤 {user_question}**")
bot_response = response(user_question, st.session_state['all_anomalies'])
st.session_state['history'].append(('🤖', bot_response))
st.markdown(f"<div style='text-align: right'>**🤖 {bot_response}**</div>", unsafe_allow_html=True)
if st.button("Limpar"):
st.session_state['history'] = []
for sender, message in st.session_state['history']:
if sender == '👤':
st.markdown(f"**👤 {message}**")
elif sender == '🤖':
st.markdown(f"<div style='text-align: right'>**🤖 {message}**</div>", unsafe_allow_html=True)
else:
st.warning("Por favor, carregue um arquivo CSV ou XLSX para começar.")
|