import random
from datetime import datetime, timedelta
from math import sqrt

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

import model_utils as mu


def model_run(df_all):
    """Forecast tomorrow's cryptocurrency price with a random forest.

    The raw data is reshaped by ``mu.data_transform`` into a supervised
    learning frame (one row per observation, target column ``prices``).
    A forest fitted on a random 80 % of the rows predicts the final row,
    and that prediction is written into the last row of the frame returned
    to the caller. A separate expanding-window backtest scores the approach.

    Args:
        df_all: Raw input passed unchanged to ``mu.data_transform``.

    Returns:
        A tuple ``(df_with_forecast, accuracy, result_rf)`` where

        * ``df_with_forecast`` is a copy of the second frame returned by
          ``mu.data_transform`` whose last-row ``prices`` holds the forecast,
        * ``accuracy`` is the R² score of the walk-forward backtest,
        * ``result_rf`` is a DataFrame with columns ``prediction`` and
          ``data`` (backtest predictions and the matching observed prices).

    Side effects:
        Writes ``reframed_lags.csv`` and ``result_rf.csv`` to the current
        working directory and prints progress messages to stdout.
    """
    first_day_future = pd.to_datetime(datetime.now() + timedelta(days=1))

    reframed_lags, df_final = mu.data_transform(df_all, first_day_future)
    print('I have transformed the dataset into the frame for supervised learning')
    reframed_lags.to_csv('reframed_lags.csv')

    train, test = _random_train_test_split(reframed_lags, train_size=0.8)
    print('I have split the dataset into training and testing samples')

    forecast_price = _forecast_last_row(train, test)

    # Overwrite the price of the last row (the day being forecast).
    df_with_forecast = df_final.copy()
    is_last_row = df_with_forecast.index == df_with_forecast.index[-1]
    df_with_forecast.loc[is_last_row, 'prices'] = forecast_price
    print('Final result')
    print(df_with_forecast)

    accuracy, result_rf = _walk_forward_backtest(reframed_lags)
    result_rf.to_csv('result_rf.csv')
    return df_with_forecast, accuracy, result_rf


def _random_train_test_split(frame, train_size):
    """Randomly split ``frame`` into train and test rows.

    Rows are selected by matching the values of the ``index`` column created
    by ``reset_index()`` against randomly drawn integers.
    NOTE(review): this assumes ``frame`` carries an unnamed 0..n-1 integer
    index -- confirm against ``mu.data_transform``.

    Args:
        frame: Supervised-learning frame to split.
        train_size: Fraction of rows to draw for training.

    Returns:
        ``(train, test)`` DataFrames without the helper ``index`` column.
    """
    indexed = frame.reset_index()
    n_train = int(len(indexed) * train_size)

    # Drawing from range(len - 1) can never select the final position, so
    # the row that has to be forecast always stays in the test part.
    train_ids = random.sample(range(len(indexed) - 1), n_train)
    in_train = indexed['index'].isin(train_ids)

    # iloc[:, 1:] drops the 'index' column that reset_index() prepended.
    train = indexed.loc[in_train].iloc[:, 1:]
    test = indexed.loc[~in_train].iloc[:, 1:]
    return train, test


def _forecast_last_row(train, test):
    """Fit a forest on ``train`` and predict the price of the last ``test`` row.

    Args:
        train: Training rows including the ``prices`` target column.
        test: Held-out rows with the same columns; the last one is forecast.

    Returns:
        The predicted price for the last row of ``test``, in original
        (unscaled) units.
    """
    # Fit the scaler on the training rows only and reuse it for the test
    # rows, so both sets live on the same scale and the prediction can be
    # mapped back with the same min/max it was learned on.
    scaler = MinMaxScaler(feature_range=(0, 1))
    train_scaled = pd.DataFrame(
        scaler.fit_transform(train.values.astype('float32')),
        columns=train.columns,
    )
    test_scaled = pd.DataFrame(
        scaler.transform(test.values.astype('float32')),
        columns=test.columns,
    )

    # The target column must not be part of the features; otherwise the
    # forest would simply read the answer off its own input.
    rf = RandomForestRegressor(n_estimators=1000)
    rf.fit(
        train_scaled.drop(columns='prices').values,
        train_scaled['prices'].values,
    )
    test_scaled['prices'] = rf.predict(
        test_scaled.drop(columns='prices').values
    )

    # inverse_transform needs the full column layout the scaler was fitted
    # on, so the predictions are put back into the frame before unscaling.
    restored = pd.DataFrame(
        scaler.inverse_transform(test_scaled.values.astype('float')),
        columns=test.columns,
    )
    return restored['prices'].iloc[-1]


def _walk_forward_backtest(reframed_lags):
    """Score the model with an expanding-window, one-step-ahead backtest.

    The initial training share depends on the amount of data (0.9 above 500
    rows, 0.8 above 200 rows, 0.7 otherwise). For every remaining row a
    fresh forest is fitted on all rows before it and asked to predict it.
    NOTE(review): the window can reach the final row of the frame, i.e. the
    row being forecast -- confirm that its ``prices`` value is meaningful.

    Args:
        reframed_lags: Supervised-learning frame with a ``prices`` column.

    Returns:
        ``(accuracy, result_rf)``: the R² score and a DataFrame with columns
        ``prediction`` and ``data``.
    """
    n_rows = len(reframed_lags)
    if n_rows > 500:
        train_size = 0.9
    elif n_rows > 200:
        train_size = 0.8
    else:
        train_size = 0.7

    first_test_row = int(n_rows * train_size)
    window_length = int(n_rows - n_rows * train_size)

    features = reframed_lags.drop(columns='prices')
    labels = reframed_lags['prices']

    predicted = []
    observed = []
    for test_row in range(first_test_row, first_test_row + window_length):
        rf = RandomForestRegressor(n_estimators=1000)
        rf.fit(features.iloc[:test_row], labels.iloc[:test_row])
        predicted.extend(rf.predict(features.iloc[test_row:test_row + 1]))
        observed.extend(labels.iloc[test_row:test_row + 1])

    predicted = np.asarray(predicted, dtype=float)
    observed = np.asarray(observed, dtype=float)

    # r2_score expects (y_true, y_pred); the score is not symmetric.
    accuracy = r2_score(observed, predicted)
    result_rf = pd.DataFrame({'prediction': predicted, 'data': observed})
    return accuracy, result_rf