Spaces:

KatGaw
/

Krypto1

Sleeping

App Files Files Community

Krypto1 / app_crypto_rf_model.py

KatGaw

adding new reddit group

05a3e2c 4 months ago

raw

history blame

5.41 kB

	from datetime import datetime, timedelta
	import pandas as pd
	import numpy as np

	from sklearn.ensemble import RandomForestRegressor
	from sklearn.metrics import mean_squared_error
	from math import sqrt
	from sklearn.preprocessing import MinMaxScaler
	import model_utils as mu

	def model_run(df_all, n_estimators=1000):
	    """Fit a random forest on lagged data and forecast the next price.

	    The function
	      1. asks ``mu.data_transform`` for the supervised-learning frame
	         (``reframed_lags``) and the frame that receives the forecast
	         (``df_final``), passing tomorrow's timestamp;
	      2. randomly splits ``reframed_lags`` into train/test (80/20) while
	         always keeping the last row in the test part;
	      3. min-max scales both parts with a scaler fitted on train only,
	         fits a random forest on every column except ``'prices'`` and
	         writes the de-scaled prediction for the last row into the last
	         row of a copy of ``df_final``;
	      4. re-fits the forest on an expanding window to produce one-step-ahead
	         predictions and scores them with R^2.

	    Side effects: writes ``reframed_lags.csv`` and ``result_rf.csv`` to the
	    current working directory and prints progress messages.

	    Args:
	        df_all: input data forwarded unchanged to ``mu.data_transform``.
	        n_estimators: number of trees for every forest that is fitted
	            (default 1000, the value that used to be hard-coded).

	    Returns:
	        A tuple ``(df_with_forecast, accuracy, result_rf)`` where
	        ``df_with_forecast`` is ``df_final`` with the forecast in the
	        ``'prices'`` column of its last row, ``accuracy`` is the R^2 of the
	        expanding-window predictions (NaN when fewer than two predictions
	        could be made) and ``result_rf`` is a DataFrame with the columns
	        ``'prediction'`` and ``'data'``.

	    Raises:
	        ValueError: if ``reframed_lags`` is too short to leave any
	            training row.
	    """
	    first_day_future = pd.to_datetime(datetime.now() + timedelta(days=1))

	    # ---------------- dataset manipulation for supervised learning
	    reframed_lags, df_final = mu.data_transform(df_all, first_day_future)
	    print('I have transformed the dataset into the frame for supervised learning')
	    reframed_lags.to_csv('reframed_lags.csv')

	    # ---------------- train/test split
	    train, test = _random_train_test_split(reframed_lags, train_fraction=0.8)
	    print('I have split the dataset into training and testing samples')

	    # ---------------- re-scale to (0, 1)
	    # One scaler, fitted on the training rows only and re-used for the test
	    # rows, so both parts live in the same scaled space and the prediction
	    # can be mapped back with the very same min/max.
	    scaler = MinMaxScaler(feature_range=(0, 1))
	    df_train = pd.DataFrame(
	        scaler.fit_transform(train.values.astype('float32')),
	        columns=train.columns,
	    )
	    df_test = pd.DataFrame(
	        scaler.transform(test.values.astype('float32')),
	        columns=test.columns,
	    )

	    # ---------------- model
	    # The label column must not be part of the feature matrix; this matches
	    # what the expanding-window evaluation below does.
	    train_features = df_train.drop(columns='prices').values
	    test_features = df_test.drop(columns='prices').values
	    train_labels = df_train['prices'].values

	    rf = RandomForestRegressor(n_estimators=n_estimators)
	    rf.fit(train_features, train_labels)
	    predictions = rf.predict(test_features)

	    # ---------------- model output transformation
	    # Put the scaled predictions into the label column and undo the scaling
	    # on the whole frame (the scaler was fitted on all columns).
	    df_test['prices'] = predictions
	    inv_transformed = pd.DataFrame(
	        scaler.inverse_transform(df_test.values.astype('float')),
	        columns=test.columns,
	    )
	    forecast_price = inv_transformed['prices'].iloc[-1]

	    # data with forecast
	    df_with_forecast = df_final.copy()
	    is_last_row = df_with_forecast.index == df_with_forecast.index[-1]
	    df_with_forecast.loc[is_last_row, 'prices'] = forecast_price
	    print('Final result')
	    print(df_with_forecast)

	    # ---------------- model accuracy (expanding window, unscaled data)
	    predictions_all, test_labels_all = _rolling_window_predictions(
	        reframed_lags, n_estimators
	    )
	    if len(predictions_all) < 2:
	        # R^2 is undefined for fewer than two samples.
	        accuracy = float('nan')
	    else:
	        from sklearn.metrics import r2_score
	        # r2_score expects (y_true, y_pred); the score is not symmetric.
	        accuracy = r2_score(test_labels_all, predictions_all)

	    result_rf = pd.DataFrame(
	        {'prediction': predictions_all, 'data': test_labels_all}
	    )
	    result_rf.to_csv('result_rf.csv')
	    return df_with_forecast, accuracy, result_rf


	def _random_train_test_split(frame, train_fraction=0.8):
	    """Randomly split the rows of *frame*, keeping the last row in test."""
	    import random

	    n_rows = len(frame)
	    n_train = int(n_rows * train_fraction)
	    if n_train == 0:
	        raise ValueError(
	            f'need at least 2 rows to build a training set, got {n_rows}'
	        )
	    # Sample positions from all rows but the last one, so the final row is
	    # guaranteed to end up in the test part.
	    train_positions = random.sample(range(n_rows - 1), n_train)
	    # Positional boolean mask: independent of the frame's index labels and
	    # keeps the original row order in both parts.
	    is_train = np.zeros(n_rows, dtype=bool)
	    is_train[train_positions] = True
	    return frame.iloc[is_train], frame.iloc[~is_train]


	def _rolling_window_predictions(reframed_lags, n_estimators):
	    """Return one-step-ahead predictions and true labels on an expanding window."""
	    n_rows = len(reframed_lags)
	    # Longer histories keep a larger share of rows for the initial fit.
	    if n_rows > 500:
	        train_size = 0.9
	    elif n_rows > 200:
	        train_size = 0.8
	    else:
	        train_size = 0.7

	    first_cutoff = int(n_rows * train_size)
	    window_length = int(n_rows - n_rows * train_size)

	    features = reframed_lags.drop(columns='prices')
	    labels = reframed_lags['prices']

	    predicted = []
	    actual = []
	    # NOTE(review): the window runs up to the last row of reframed_lags. If
	    # that row is the future day created by mu.data_transform, its 'prices'
	    # value is not an observed price - confirm and shorten the window if so.
	    for step in range(window_length):
	        cutoff = first_cutoff + step
	        rf = RandomForestRegressor(n_estimators=n_estimators)
	        # Fit on everything before the cutoff, predict the single next row.
	        rf.fit(features.iloc[:cutoff], labels.iloc[:cutoff])
	        predicted.extend(rf.predict(features.iloc[cutoff:cutoff + 1]))
	        actual.extend(labels.iloc[cutoff:cutoff + 1])

	    return np.asarray(predicted, dtype=float), np.asarray(actual, dtype=float)