# app_crypto_rf_model.py
from datetime import datetime, timedelta
import random

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler

import model_utils as mu
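
# Note: model_utils.data_transform (imported as mu) is assumed to return the
# supervised-learning frame with lagged features (reframed_lags) together with
# the original price frame extended by one future day (df_final); that is how
# its two return values are used inside model_run below.
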
def model_run(df_all):
    """Prediction function that runs a random forest model and predicts tomorrow's cryptocurrency price."""
    first_day_future = pd.to_datetime(datetime.now() + timedelta(days=1))

    # ----------------------------------------- DATASET MANIPULATION FOR SUPERVISED LEARNING -----------------------------------------
    reframed_lags, df_final = mu.data_transform(df_all, first_day_future)
    print('I have transformed the dataset into the frame for supervised learning')
    reframed_lags.to_csv('reframed_lags.csv')
    # ----------------------------------------- TRAIN/TEST SPLIT -----------------------------------------
    # Randomly split the chunk into train/test based on the train ratio (0.8); the sorted
    # random indices are reused so the chunks of all the other currencies are split the same way.
    train_size = 0.8
    df_cut1 = reframed_lags.reset_index().iloc[:, 1:]
    train_value = int(len(df_cut1) * train_size)
    first_random = random.sample(range(len(df_cut1) - 1), train_value)
    train_bulk = np.sort(first_random)  # sort so all subsequent chunks use the same random indices
    df_cut = reframed_lags.reset_index()
    train_sample = df_cut.loc[df_cut['index'].isin(train_bulk)]
    test_sample = df_cut.loc[~df_cut['index'].isin(train_bulk)]
    test = test_sample.iloc[:, 1:]
    train = train_sample.iloc[:, 1:]
    print('I have split the dataset into training and testing samples')
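    # Because the random indices are drawn from range(len(df_cut1) - 1), the last row
    # (presumably the appended future day) can never be sampled into the training set,
    # so it always lands in the test sample and therefore receives a prediction below.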
    # ----------------------------------- Re-scale for supervised learning
    # TRAIN RESCALE
    # normalize features for supervised learning to the (0, 1) range
    scaler_train = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler_train.fit_transform(train.values.astype('float32'))
    df_train = pd.DataFrame(scaled)
    df_train.columns = train.columns  # restore column names
    # TEST RESCALE
    scaler_test = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler_test.fit_transform(test.values.astype('float32'))
    df_test = pd.DataFrame(scaled)
    df_test.columns = test.columns  # restore column names
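    # The test sample gets its own scaler (scaler_test), fitted on the test rows only;
    # the same scaler is reused further down to inverse-transform the scaled predictions
    # back to price units.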
    # ----------------------------------- MODEL
    # define features
    # (note: the scaled frame still contains the 'prices' column itself, so the target is
    # part of the feature matrix here; the rolling-window accuracy loop below drops it)
    train_features = df_train.values
    test_features = df_test.values
    # define labels
    train_labels = df_train['prices'].values
    test_labels = df_test['prices'].values
    # define a baseline prediction (the last observed value, i.e. the first feature column)
    # for gauging prediction accuracy
    baseline_preds = pd.DataFrame(test_features).iloc[:, 0]
    # calculate errors for the baseline prediction (computed for reference; not returned)
    baseline_errors = abs(baseline_preds - test_labels)
    # instantiate the random forest with 1000 decision trees and fit it on the training rows
    rf = RandomForestRegressor(n_estimators=1000)
    rf.fit(train_features, train_labels)
    prediction_rf = rf.predict(test_features)
    predictions = prediction_rf
    # ----------------------------------- MODEL OUTPUT TRANSFORMATION
    # put the scaled predictions into the test frame and invert the scaling
    df_test['prices'] = predictions
    prediction_transformed = pd.DataFrame(scaler_test.inverse_transform(df_test.values.astype('float')))
    prediction_transformed.columns = test.columns
    # convert the prediction for the last (future) row
    df_test.loc[df_test.index == (len(df_test) - 1), 'prices'] = predictions[-1]
    inv_transformed = pd.DataFrame(scaler_test.inverse_transform(df_test.values.astype('float')))
    inv_transformed.columns = test.columns
    # data with forecast
    df_with_forecast = df_final.copy()
    df_with_forecast.loc[df_with_forecast.index == df_with_forecast.index[-1], 'prices'] = inv_transformed['prices'].iloc[-1]
    print('Final result')
    print(df_with_forecast)
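    # At this point df_with_forecast holds the history from df_final with tomorrow's
    # predicted price written into its last row; the block below estimates accuracy
    # separately by refitting the model on an expanding window.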
    # ----------------------------------- MODEL ACCURACY
    # Rolling-window accuracy measure: refit the model on an expanding training window
    # and predict one step ahead each time, so accuracy is calculated on out-of-sample
    # observations on the original (untransformed) price scale.
    if len(reframed_lags) > 500:
        train_size = 0.9
    elif len(reframed_lags) > 200:
        train_size = 0.8
    else:
        train_size = 0.7
    predictions = []
    test_labels_all = []
    window_length = int(len(reframed_lags) - len(reframed_lags) * train_size)
    for i in range(0, window_length):
        train_accuracy = reframed_lags.iloc[0:int(len(reframed_lags) * train_size) + i, :]
        test_accuracy = reframed_lags.iloc[len(train_accuracy):len(train_accuracy) + 1, :]
        train_features_accuracy = train_accuracy.drop(columns='prices')
        test_features_accuracy = test_accuracy.drop(columns='prices')
        train_labels_accuracy = train_accuracy['prices']
        test_labels_accuracy = test_accuracy['prices']
        rf = RandomForestRegressor(n_estimators=1000)
        rf.fit(train_features_accuracy, train_labels_accuracy)
        prediction_rf = rf.predict(test_features_accuracy)
        predictions = np.append(predictions, prediction_rf)
        test_labels_all = np.append(test_labels_all, test_labels_accuracy)
    # calculate accuracy as R^2 between the observed labels and the rolling predictions
    # (r2_score expects the true values first, then the predictions)
    accuracy = r2_score(test_labels_all, predictions)
    result_rf = pd.DataFrame({'prediction': predictions, 'data': test_labels_all})
    result_rf.to_csv('result_rf.csv')
    return df_with_forecast, accuracy, result_rf
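

# Example usage (a minimal sketch; `load_crypto_frame` is a hypothetical helper standing
# in for whatever upstream code in this Space assembles the price/sentiment frame that
# gets passed in as df_all):
#
#     df_all = load_crypto_frame()
#     df_with_forecast, accuracy, result_rf = model_run(df_all)
#     print(f'Rolling-window R^2: {accuracy:.3f}')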