# !pip install statsmodels == 0.12.1

# !pip install pmdarima

# Version check 
import statsmodels

statsmodels.__version__

'0.12.1'

# Libraries to do data manipulation
import numpy as np

import pandas as pd

# Library to do data visualization
import matplotlib.pyplot as plt

# Library to do time series decomposition
import statsmodels.api as sm

# Module to create ACF and PACF plots
from statsmodels.graphics import tsaplots

# Module to build AR, MA, ARMA, and ARIMA models
from statsmodels.tsa.arima.model import ARIMA

# Module to implement MSE and RSME during model evaluation
from sklearn.metrics import mean_squared_error

# Code for ignoring unnecessary warnings while executing some code  
import warnings
warnings.filterwarnings("ignore")

data = pd.read_csv('Crude Oil Production by Country.csv')

data.head()

# Using loc and index = 2 to fetch the data for United States from the original dataset
united_states = data.loc[0]

# Dropping the variable country, as we only need the time and production information to build the model
united_states = pd.DataFrame(united_states).drop(['Country'])

# Fetching the two columns - YEAR and OIL PRODUCTION
united_states = united_states.reset_index()

united_states.columns = ['YEAR', 'OIL PRODUCTION']

# Converting the data type for variable OIL PRODUCTION to integer
united_states['OIL PRODUCTION'] = united_states['OIL PRODUCTION'].astype(int)

# Converting the YEAR column data type to datetime
united_states['YEAR'] = pd.to_datetime(united_states['YEAR'])

# Setting the variable YEAR as the index of this dataframe
united_states = united_states.set_index('YEAR')

# Checking the time series crude oil production data for United States
united_states.head()

ax = united_states.plot(color = 'blue', figsize = (16, 8))

ax.set_title('Yearly crude oil production by United States')

plt.show()

# Using seasonal_decompose function to decompose the time series into its individual components
decomposition = sm.tsa.seasonal_decompose(united_states)

# Creating an empty dataframe to store the individual components
decomposed_data = pd.DataFrame()

# Extracting the trend component of time series
decomposed_data['trend'] = decomposition.trend

# Extracting the seasonal component of time series
decomposed_data['seasonal'] = decomposition.seasonal

# Extracting the white noise or residual component of time series
decomposed_data['random_noise'] = decomposition.resid

fig, (ax1, ax2, ax3) = plt.subplots(nrows = 3, ncols = 1, figsize = (20, 16))

decomposed_data['trend'].plot(ax = ax1)

decomposed_data['seasonal'].plot(ax = ax2)

decomposed_data['random_noise'].plot(ax = ax3)

<AxesSubplot:xlabel='YEAR'>

# Using the first 20 years data as the training data
train_data = united_states.loc['1992-01-01' : '2012-01-01']

# Using the last 7 years data as the test data
test_data = united_states.loc['2012-01-01':]

# Creating a subplot space
fig, ax = plt.subplots(figsize = (16, 6))

# Plotting train data
train_data.plot(ax = ax)

# Plotting test data
test_data.plot(ax = ax)

# Adding the legends in sequential order
plt.legend(['train data', 'test data'])

# Showing the time which divides the original data into train and test
plt.axvline(x = '2012-01-01', color = 'black', linestyle = '--')

# Showing the plot
plt.show()

# Importing ADF test from statsmodels package
from statsmodels.tsa.stattools import adfuller

# Implementing ADF test on the original time series data
result = adfuller(train_data['OIL PRODUCTION'])

# Printing the results
print(result[0])

print(result[1]) # To get the p-value

print(result[4])

-0.5829098523091641
0.8747971281795595
{'1%': -4.01203360058309, '5%': -3.1041838775510207, '10%': -2.6909873469387753}

# Implementing ADF test on the original time series data
result = adfuller(train_data['OIL PRODUCTION'])

fig, ax = plt.subplots(figsize = (16, 6))

train_data.plot(ax = ax)

plt.show()

# Printing the results

print('ADF Statistic:', result[0])

print('p-value:', result[1])

ADF Statistic: -0.5829098523091641
p-value: 0.8747971281795595

# Taking the 1st order differencing of the timeseries
train_data_stationary = train_data.diff().dropna()

# Implementing ADF test on the first order differenced time series data
result = adfuller(train_data_stationary['OIL PRODUCTION'])

fig, ax = plt.subplots(figsize = (16, 6))

train_data_stationary.plot(ax = ax)

plt.show()

# Printing the results

print('ADF Statistic:', result[0])

print('p-value:', result[1])

ADF Statistic: 1.575800707060134
p-value: 0.9977831288888281

# Taking the 2nd order differencing of the time series
train_data_stationary = train_data.diff().diff().dropna()

# Implementing ADF test on the second order differenced time series data
result = adfuller(train_data_stationary['OIL PRODUCTION'])

fig, ax = plt.subplots(figsize = (16, 6))

train_data_stationary.plot(ax = ax)

plt.show()

# Printing the results

print('ADF Statistic:', result[0])

print('p-value:', result[1])

ADF Statistic: -1.5580506601256086
p-value: 0.504624368911218

# Taking the 3rd order differencing of the time series
train_data_stationary = train_data.diff().diff().diff().dropna()

# Implementing ADF test on the second order differenced time series data
result = adfuller(train_data_stationary['OIL PRODUCTION'])

fig, ax = plt.subplots(figsize = (16, 6))

train_data_stationary.plot(ax = ax)

plt.show()

# Printing the results

print('ADF Statistic:', result[0])

print('p-value:', result[1])

ADF Statistic: -6.191607528895948
p-value: 6.10365022487616e-08

# Creating two subplots to show ACF and PACF plots
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (16, 6))

# Creating and plotting the ACF charts starting from lag = 1
tsaplots.plot_acf(train_data_stationary, zero = False, ax = ax1)

# Creating and plotting the ACF charts starting from lag = 1 till lag = 8
tsaplots.plot_pacf(train_data_stationary, zero = False, ax = ax2, lags = 8)

plt.show()

# We are using the ARIMA function to build the AR model, so we need to pass the stationary time series that we got after double 
# differencing the original time series. Also, we will keep the q parameter as 0, so that the model acts as an AR model

# Creating an AR model with parameter p = 1
ar_1_model = ARIMA(train_data_stationary, order = (1, 0, 0))

# Creating an AR model with parameter p = 2
ar_2_model = ARIMA(train_data_stationary, order = (2, 0, 0))

# Creating an AR model with parameter p = 3
ar_3_model = ARIMA(train_data_stationary, order = (3, 0, 0))

# Creating an AR model with parameter p = 4
ar_4_model = ARIMA(train_data_stationary, order = (4, 0, 0))

# Fitting all the models that we implemented in the above cell

ar_1_results = ar_1_model.fit()

ar_2_results = ar_2_model.fit()

ar_3_results = ar_3_model.fit()

ar_4_results = ar_4_model.fit()

def plot_predicted_output(results, ax):
    
    # We are taking double cumulative sum of forecasted values (which is inverse of double differencing)
    # And we are also adding the last element of the training data to the forecasted values to get back to the original scale
    predictions = np.cumsum(np.cumsum(results.predict(start = 19, end = 25))) + train_data.iloc[-1][0]
    
    # Setting indices of the test data into prediction values
    predictions.index = test_data.index
    
    # Computing the AIC and RMSE metrics for the model and printing it into title of the plot
    train_data.plot(ax = ax, label = 'train', 
                    title = 'AIC: {}'.format(np.round(results.aic, 2)) + 
                           ' , ' +
                           'RMSE: {}'.format(np.round(np.sqrt(mean_squared_error(test_data, predictions)), 2)))
    
    # Plotting the test data
    test_data.plot(ax = ax) 
    
    # Plotting the forecasted data
    predictions.plot(ax = ax)
    
    # Adding the legends sequentially
    ax.legend(['train data', 'test data', 'forecasted values'])

# Plotting the forecasted values along with train and test for all the models

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows = 2, ncols = 2, figsize = (20, 10))

plot_predicted_output(ar_1_results, ax1)

plot_predicted_output(ar_2_results, ax2)

plot_predicted_output(ar_3_results, ax3)

plot_predicted_output(ar_4_results, ax4)

plt.show()

ar_4_results.summary()

# We are using the ARIMA function to build the MA model, so we need to pass the stationary time series that we got after double 
# differencing the original time series. Also, we will keep the p parameter as 0 so that the model acts as an MA model

# Creating MA model with parameter q = 1
ma_1_model = ARIMA(train_data_stationary, order = (0, 0, 1))

# Creating MA model with parameter q = 2
ma_2_model = ARIMA(train_data_stationary, order = (0, 0, 2))

# Creating MA model with parameter q = 3
ma_3_model = ARIMA(train_data_stationary, order = (0, 0, 3))

# Creating MA model with parameter q = 4
ma_4_model = ARIMA(train_data_stationary, order = (0, 0, 4))

# Fitting all the models that we implemented in the above cell

ma_1_results = ma_1_model.fit()

ma_2_results = ma_2_model.fit()

ma_3_results = ma_3_model.fit()

ma_4_results = ma_4_model.fit()

# Plotting the forecasted values along with train and test for all the models

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows = 2, ncols = 2, figsize = (20, 10))

plot_predicted_output(ma_1_results, ax1)

plot_predicted_output(ma_2_results, ax2)

plot_predicted_output(ma_3_results, ax3)

plot_predicted_output(ma_4_results, ax4)

plt.show()

ma_2_results.summary()

# We are using the ARIMA function here, so we need to pass stationary time series that we got after double differencing the 
# original time series

# Creating an ARMA model with parameters p = 2 and q = 1
ar_2_ma_1_model = ARIMA(train_data_stationary, order = (2, 0, 1))

# Creating an ARMA model with parameters p = 2 and q = 2
ar_2_ma_2_model = ARIMA(train_data_stationary, order=(2, 0, 2))

# Creating an ARMA model with parameters p = 3 and q = 2
ar_3_ma_2_model = ARIMA(train_data_stationary, order = (3, 0, 2))

# Creating an ARMA model with parameters p = 2 and q = 3
ar_2_ma_3_model = ARIMA(train_data_stationary, order = (2, 0, 3))

# Fitting all the models that we implemented in the above cell

ar_2_ma_1_results = ar_2_ma_1_model.fit()

ar_2_ma_2_results = ar_2_ma_2_model.fit()

ar_3_ma_2_results = ar_3_ma_2_model.fit()

ar_2_ma_3_results = ar_2_ma_3_model.fit()

# Plotting the forecasted values along with train and test for all the models

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows = 2, ncols = 2, figsize = (20, 10))

plot_predicted_output(ar_2_ma_1_results, ax1)

plot_predicted_output(ar_2_ma_2_results, ax2)

plot_predicted_output(ar_3_ma_2_results, ax3)

plot_predicted_output(ar_2_ma_3_results, ax4)

plt.show()

ar_2_ma_1_results.summary()

train_data = train_data.astype('float32')

# Creating an ARIMA model with parameters p = 2, d = 3 and q = 1
ar_2_d_3_ma_1_model = ARIMA(train_data, order = (2, 3, 1))

# Creating an ARIMA model with parameters p = 1, d = 3 and q = 2
ar_1_d_3_ma_2_model = ARIMA(train_data, order = (1, 3, 2))

# Creating an ARIMA model with parameters p = 2, d = 3 and q = 2
ar_2_d_3_ma_2_model = ARIMA(train_data, order = (2, 3, 2))

# Creating an ARIMA model with parameters p = 3, d = 3 and q = 2
ar_3_d_3_ma_2_model = ARIMA(train_data, order = (3, 3, 2))

# Fitting all the models that we implemented in the above cell

ar_2_d_3_ma_1_results = ar_2_d_3_ma_1_model.fit()

ar_1_d_3_ma_2_results = ar_1_d_3_ma_2_model.fit()

ar_2_d_3_ma_2_results = ar_2_d_3_ma_2_model.fit()

ar_3_d_3_ma_2_results = ar_3_d_3_ma_2_model.fit()

def plot_predicted_output_new(results, ax):
    
    predictions = results.predict(start = 19, end = 25)
    
    # Setting indices of the test data into prediction values
    predictions.index = test_data.index
    
    # Computing the AIC and RMSE metrics for the model and printing it into title of the plot
    train_data.plot(ax = ax, label = 'train', 
                    
                    title = 'AIC: {}'.format(np.round(results.aic, 2)) + 
                           ' , ' +
                           'RMSE: {}'.format(np.round(np.sqrt(mean_squared_error(test_data, predictions)), 2)))
    
    # Plotting the test data
    test_data.plot(ax = ax) 
    
    # Plotting the forecasted data
    predictions.plot(ax = ax)
    
    # Adding the legends sequentially
    ax.legend(['train data', 'test data', 'forecasted values'])

# Plotting the forecasted values along with train and test for all the models

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows = 2, ncols = 2, figsize = (20, 10))

plot_predicted_output_new(ar_2_d_3_ma_1_results, ax1)

plot_predicted_output_new(ar_1_d_3_ma_2_results, ax2)

plot_predicted_output_new(ar_2_d_3_ma_2_results, ax3)

plot_predicted_output_new(ar_3_d_3_ma_2_results, ax4)

plt.show()

ar_2_d_3_ma_2_results.summary()

final_model = ARIMA(united_states, order = (2, 3, 2))

final_model_results = final_model.fit()

forecasted_ARIMA = final_model_results.predict(start = '2019-01-01', end = '2025-01-01')

# Plotting the original time seris with forecast

plt.figure(figsize = (16, 8))

plt.plot(united_states, color = 'c', label = 'Original Series')

plt.plot(forecasted_ARIMA, label = 'Forecasted Series', color = 'b')

plt.title('Actual vs Predicted')

plt.legend()

plt.show()

import pmdarima as pm

auto_arima_model = pm.auto_arima(train_data, d = 3, seasonal = False, trace = True, 
                                 error_action = 'ignore', suppress_warnings = True)

print(auto_arima_model.summary())

Performing stepwise search to minimize aic
 ARIMA(2,3,2)(0,0,0)[0]             : AIC=inf, Time=0.07 sec
 ARIMA(0,3,0)(0,0,0)[0]             : AIC=258.092, Time=0.01 sec
 ARIMA(1,3,0)(0,0,0)[0]             : AIC=257.788, Time=0.01 sec
 ARIMA(0,3,1)(0,0,0)[0]             : AIC=252.229, Time=0.03 sec
 ARIMA(1,3,1)(0,0,0)[0]             : AIC=254.063, Time=0.04 sec
 ARIMA(0,3,2)(0,0,0)[0]             : AIC=252.880, Time=0.03 sec
 ARIMA(1,3,2)(0,0,0)[0]             : AIC=inf, Time=0.09 sec
 ARIMA(0,3,1)(0,0,0)[0] intercept   : AIC=inf, Time=0.02 sec

Best model:  ARIMA(0,3,1)(0,0,0)[0]          
Total fit time: 0.292 seconds
                               SARIMAX Results                                
==============================================================================
Dep. Variable:                      y   No. Observations:                   21
Model:               SARIMAX(0, 3, 1)   Log Likelihood                -124.115
Date:                Thu, 28 Apr 2022   AIC                            252.229
Time:                        19:21:19   BIC                            254.010
Sample:                             0   HQIC                           252.475
                                 - 21                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ma.L1         -0.8398      0.297     -2.828      0.005      -1.422      -0.258
sigma2      4.983e+04    1.2e+04      4.160      0.000    2.64e+04    7.33e+04
===================================================================================
Ljung-Box (L1) (Q):                   0.87   Jarque-Bera (JB):                 3.09
Prob(Q):                              0.35   Prob(JB):                         0.21
Heteroskedasticity (H):               3.02   Skew:                             0.97
Prob(H) (two-sided):                  0.20   Kurtosis:                         3.62
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

fig = plt.figure(figsize = (16, 8))

fig = auto_arima_model.plot_diagnostics(fig = fig)

	Country	1992	1993	1994	1995	1996	1997	1998	1999	2000	...	2009	2010	2011	2012	2013	2014	2015	2016	2017	2018
0	United States	7171	6847	6662	6560	6465	6451	6252	5881	5822	...	5349	5478	5654	6502.0	7467.0	8759.0	9431.0	8831.0	9352.0	10962.0
1	Saudi Arabia	8332	8198	8120	8231	8218	8362	8389	7833	8404	...	8250	8900	9458	9832.0	9693.0	9735.0	10168.0	10461.0	10134.0	10425.0
2	Russia	7632	6730	6135	5995	5850	5920	5854	6079	6479	...	9495	9694	9774	9922.0	10054.0	10107.0	10253.0	10551.0	10580.0	10759.0
3	Canada	1605	1679	1746	1805	1837	1922	1981	1907	1977	...	2579	2741	2901	3138.0	3325.0	3613.0	3677.0	3679.0	3977.0	4264.0
4	Iraq	425	512	553	560	579	1155	2150	2508	2571	...	2391	2399	2626	2983.0	3054.0	3368.0	4045.0	4444.0	4454.0	4613.0

	OIL PRODUCTION
YEAR
1992-01-01	7171
1993-01-01	6847
1994-01-01	6662
1995-01-01	6560
1996-01-01	6465

	coef	std err	z	P>\|z\|	[0.025	0.975]
const	8.0306	17.221	0.466	0.641	-25.722	41.783
ar.L1	-0.8563	0.371	-2.308	0.021	-1.583	-0.129
ar.L2	-1.0208	0.534	-1.911	0.056	-2.067	0.026
ar.L3	-0.3271	0.410	-0.799	0.425	-1.130	0.476
ar.L4	-0.4350	0.340	-1.280	0.201	-1.101	0.231
sigma2	3.249e+04	2.03e+04	1.600	0.110	-7311.529	7.23e+04

	coef	std err	z	P>\|z\|	[0.025	0.975]
const	10.9137	12.362	0.883	0.377	-13.315	35.143
ma.L1	-1.7168	36.087	-0.048	0.962	-72.446	69.013
ma.L2	0.9979	41.885	0.024	0.981	-81.095	83.091
sigma2	3.386e+04	1.4e+06	0.024	0.981	-2.72e+06	2.78e+06

	coef	std err	z	P>\|z\|	[0.025	0.975]
const	8.4304	35.159	0.240	0.810	-60.479	77.340
ar.L1	-1.2892	0.178	-7.253	0.000	-1.638	-0.941
ar.L2	-0.8171	0.250	-3.264	0.001	-1.308	-0.326
ma.L1	0.9882	3.208	0.308	0.758	-5.300	7.276
sigma2	3.58e+04	1.07e+05	0.333	0.739	-1.75e+05	2.46e+05

USA Crude Oil Production Forecast (1992 - 2018)¶

Context¶

Objective¶

Data Dictionary¶

Importing necessary libraries¶

Loading the dataset¶

Visualizing the time series and decomposing it¶

Splitting the dataset¶

Checking for stationarity¶

ACF and PACF Plots¶

Evaluation Metrics¶

AR Modeling¶

MA Modeling¶

ARMA Modeling¶

ARIMA Modeling¶

Conclusion¶

Additional Model - Auto ARIMA¶

Dep. Variable:	OIL PRODUCTION	No. Observations:	18
Model:	ARIMA(4, 0, 0)	Log Likelihood	-120.495
Date:	Thu, 28 Apr 2022	AIC	252.990
Time:	19:21:14	BIC	258.332
Sample:	01-01-1995	HQIC	253.727
	- 01-01-2012
Covariance Type:	opg

Ljung-Box (L1) (Q):	0.57	Jarque-Bera (JB):	0.37
Prob(Q):	0.45	Prob(JB):	0.83
Heteroskedasticity (H):	2.22	Skew:	0.10
Prob(H) (two-sided):	0.36	Kurtosis:	2.33

Ljung-Box (L1) (Q):	0.38	Jarque-Bera (JB):	0.84
Prob(Q):	0.54	Prob(JB):	0.66
Heteroskedasticity (H):	1.59	Skew:	0.51
Prob(H) (two-sided):	0.59	Kurtosis:	3.30

Ljung-Box (L1) (Q):	1.47	Jarque-Bera (JB):	0.37
Prob(Q):	0.23	Prob(JB):	0.83
Heteroskedasticity (H):	1.76	Skew:	0.12
Prob(H) (two-sided):	0.51	Kurtosis:	2.34

Ljung-Box (L1) (Q):	0.67	Jarque-Bera (JB):	0.91
Prob(Q):	0.41	Prob(JB):	0.64
Heteroskedasticity (H):	1.85	Skew:	-0.10
Prob(H) (two-sided):	0.47	Kurtosis:	1.92

	coef	std err	z	P>\|z\|	[0.025	0.975]
ar.L1	-0.9701	0.276	-3.518	0.000	-1.511	-0.430
ar.L2	-0.5351	0.318	-1.685	0.092	-1.158	0.087
ma.L1	0.2168	7.403	0.029	0.977	-14.293	14.726
ma.L2	-0.7778	6.047	-0.129	0.898	-12.630	11.074
sigma2	2.577e+04	1.97e+05	0.131	0.896	-3.61e+05	4.12e+05