Building a Stock Price Prediction Model with KNN
A tutorial showing how to build a stock price prediction model using the K-Nearest Neighbors (KNN) algorithm.
- Importing dependencies and the Apple dataset
- Calculating the Moving Average
- Visualizing the price data & the moving average
- Calculating & Visualizing the Daily Returns
- Building the portfolio
- Calculating the portfolio correlation
- Predicting the Stock Price
- Pre-processing & Cross Validation
import pandas as pd
from datetime import datetime
import pandas_datareader.data as web
from pandas import Series, DataFrame
# Fetch AAPL price history from the 'yahoo' data source, covering
# 2010-01-01 up to the date on which this cell is run.
start_date = "2010-01-01"
today = f"{datetime.today():%Y-%m-%d}"
df = web.DataReader("AAPL", "yahoo", start=start_date, end=today)
df.tail()
import matplotlib as mpl
import matplotlib.pyplot as plt

# 100-row rolling mean of the adjusted close. With the default min_periods
# the first 99 rows are NaN because the window is not yet full.
df["M.Avg"] = df["Adj Close"].rolling(window=100).mean()
df.tail()

# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases dropped the old names. Use the new name when this
# Matplotlib provides it and fall back to the legacy name otherwise.
style_name = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(style_name)

# Default size for every figure created from here on.
mpl.rc("figure", figsize=(16, 8))

# Adjusted close and its moving average on the same axes.
df[["Adj Close", "M.Avg"]].plot()
plt.show()
# Row-over-row fractional change of the adjusted close, held as a
# single-column DataFrame so it can be displayed and plotted directly.
returns = df["Adj Close"].pct_change().to_frame()
returns
returns.plot()
# Adjusted close for every portfolio ticker over the same date range as df.
tickers = ['AAPL', 'AMZN', 'NFLX', 'GOOG', 'MSFT', 'FB']
portfolio_data = web.DataReader(tickers, 'yahoo', start=start_date, end=today)
dfcomp = portfolio_data["Adj Close"]
dfcomp

title = 'portfolio Adj. close price history'

# Draw one line per column of dfcomp, labelled with the column name.
for ticker in dfcomp.columns:
    plt.plot(dfcomp[ticker], label=ticker)

plt.title(title)
plt.xlabel("Date", fontsize=16)
plt.ylabel("Adj.price USD ($)", fontsize=16)
plt.legend(list(dfcomp.columns), loc='upper left')
plt.show()
# Row-over-row fractional change of each ticker's adjusted close, and the
# pairwise correlation between those return series.
pfrets = dfcomp.pct_change()
pfcorr = pfrets.corr()
pfcorr

# Scatter of GOOG returns (x) against AMZN returns (y). The y label names
# AMZN because that is the series plotted on that axis; it previously read
# "Returns NFLX", which did not match the data.
plt.scatter(pfrets["GOOG"], pfrets["AMZN"])
plt.xlabel('Returns GOOG')
plt.ylabel('Returns AMZN')
from pandas.plotting import scatter_matrix

# Grid of pairwise scatter plots of the return series, with a kernel
# density estimate on the diagonal.
scatter_matrix(pfrets, figsize=(10, 10), diagonal='kde')

# Correlation matrix drawn as a heat map; both axes are ticked with the
# matrix's column names.
tick_positions = range(len(pfcorr))
plt.imshow(pfcorr, cmap='hot', interpolation='none')
plt.colorbar()
plt.xticks(tick_positions, pfcorr.columns)
# Trailing semicolon kept: it suppresses the echoed return value in a notebook.
plt.yticks(tick_positions, pfcorr.columns);
# Mean return (x) against standard deviation of returns (y), one point per
# column of pfrets.
mean_returns = pfrets.mean()
return_std = pfrets.std()
plt.scatter(mean_returns, return_std)
plt.xlabel('Expected returns')
plt.ylabel('Risk')

# Tag every point with its column name, offset from the marker and joined
# to it by a straight arrow.
for ticker, mean_ret, risk in zip(pfrets.columns, mean_returns, return_std):
    plt.annotate(
        ticker,
        xy=(mean_ret, risk),
        xytext=(20, -20),
        textcoords='offset points',
        ha='right',
        va='bottom',
        bbox={'boxstyle': 'round,pad=0.5', 'fc': 'yellow', 'alpha': 0.5},
        arrowprops={'arrowstyle': '->', 'connectionstyle': 'arc3,rad=0'},
    )
# Feature table for the regression. It is built on an explicit copy so the
# column assignments below write to an independent frame instead of to a
# slice of df, which is what triggers pandas' SettingWithCopyWarning.
dfreg = df[['Adj Close', 'Volume']].copy()

# High-low range expressed as a percentage of the close.
dfreg['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0

# Open-to-close move expressed as a percentage of the open.
dfreg['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0
dfreg
Pre-processing & Cross Validation
We will clean up and process the data using the following steps before feeding it into the prediction model:
- Handle missing values (they are replaced with a sentinel value rather than dropped, so no rows are lost)
- Separate the label: we want to predict the Adj Close price
- Scale the features in X so that they all share a common scale before fitting the regression model
- Finally, split X into the most recent rows (late X, used for forecasting) and the earlier rows (early X, used for model training and evaluation). Separate the label and identify it as y
- Split the data into training and testing sets with train_test_split
import numpy as np
import math
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm
# Replace missing values with a sentinel instead of dropping rows, so dfreg
# keeps the same number of rows as df (the later `df[X.shape[0]:]` slice
# depends on that alignment). Rebinding instead of inplace=True also makes
# dfreg an independent frame before a new column is assigned below.
dfreg = dfreg.fillna(value=-99999)

# Forecast horizon in rows: 1% of the data set, rounded up.
forecast_out = int(math.ceil(len(dfreg) * 0.01))

# Target column: the adjusted close `forecast_out` rows ahead. The final
# `forecast_out` rows have no future value and therefore hold NaN.
dfreg['Prediction'] = df['Adj Close'].shift(-forecast_out)

# Feature matrix: every column except the target, standardised per column.
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
X = np.array(dfreg.drop(columns=['Prediction']))
X = preprocessing.scale(X)
# Discard the trailing rows that have no target value.
X = X[:-forecast_out]
X

# The most recent rows that still have a target. Their targets are the last
# `forecast_out` adjusted closes of df, which is what the final plot
# compares the predictions against.
x_forecast = X[-forecast_out:]
x_forecast

# Targets aligned row-for-row with X.
y = np.array(dfreg['Prediction'])[:-forecast_out]
y

# Keep the x_forecast rows out of the train/test pool. Otherwise the model
# could be fitted on the very rows it is later asked to "predict", which
# would make the validation plot look better than the model really is.
X_train, X_test, y_train, y_test = train_test_split(
    X[:-forecast_out], y[:-forecast_out], test_size=0.2
)
from sklearn.neighbors import KNeighborsRegressor

# Fit a 2-nearest-neighbour regressor on the training split.
clfknn = KNeighborsRegressor(n_neighbors=2).fit(X_train, y_train)

# The "confidence" reported here is the regressor's score() on the test
# split (R^2 for scikit-learn regressors).
confidenceknn = clfknn.score(X_test, y_test)
print("The KNN regression confidence is: ", confidenceknn)

# Predicted targets for the forecast rows, printed as a raw array.
forecast_set = clfknn.predict(x_forecast)
print(forecast_set)
predictions = forecast_set

# Rows of df that come after the ones used as X; the predictions are
# compared against these. `.iloc` makes the positional slice explicit, and
# `.copy()` gives an independent frame so adding the "Prediction" column
# does not write to a slice of df (avoids SettingWithCopyWarning).
valid = df.iloc[X.shape[0]:].copy()
valid["Prediction"] = predictions

plt.figure(figsize=(16, 8))
plt.title("Apple")
plt.xlabel("Days")
plt.ylabel("Adj. Close USD ($)")
# Full adjusted-close history first, then the actual and predicted values
# for the validation rows.
plt.plot(df["Adj Close"])
plt.plot(valid[["Adj Close", "Prediction"]])
# Legend entries follow the order the lines were drawn in.
plt.legend(["Orig", "Valid", "Pred"])
plt.show()