ODIN Project: Petroleum Weekly Stocks¶

No description has been provided for this image
In [268]:
import pymysql 
import pandas as pd 
import numpy as np 
import boto3 
import json 
import tensorflow as tf 
import pymysql 
import warnings 
import statsmodels.api as sm 
from sklearn import metrics
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.diagnostic import acorr_ljungbox
from datetime import datetime 
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.seasonal import STL

import seaborn as sns 
import matplotlib.pyplot as plt 

from typing import List, Dict 
warnings.filterwarnings('ignore')
plt.style.use('ggplot')

Load Petroleum Weekly Stocks¶

In [98]:
client = boto3.client('secretsmanager')
user,password,host,db,_ = list(json.loads(client.get_secret_value(SecretId='mysql_develoment').get('SecretString')).values())
con: 'MySQL' = pymysql.connect(user=user, passwd=password, db=db, host=host)

query: str = """ 
SELECT 
    %s
FROM petroleum_weekly_stocks p 
""" % (','.join(list(map(lambda item: 'p.%s' % (item), pd.read_sql('DESCRIBE petroleum_weekly_stocks', con=con ).iloc[1:-1, :1]['Field'].tolist() ))))

weekly_stocks: 'DataFrame' = pd.read_sql(query, con=con) 
weekly_stocks['period'] = pd.to_datetime(weekly_stocks['period'])
weekly_stocks['value'] = weekly_stocks['value'].astype(float) 
weekly_stocks['month_name'] = weekly_stocks['period'].apply(lambda row: row.month_name())
weekly_stocks['month'] = weekly_stocks['period'].apply(lambda row: row.month)

weekly_stocks.head() 
Out[98]:
index period duoarea area_name product product_name process process_name series series_description value month_name
0 0 2023-10-20 R40 PADD 4 EPD0 Distillate Fuel Oil SAE Ending Stocks WDISTP41 Rocky Mountain (PADD 4) Ending Stocks of Disti... 3715.0 October
1 1 2023-10-20 R30 PADD 3 EPLLPZ Propane and Propylene SAXP Ending Stocks Excluding Propylene at Terminal WPRSTP31 Gulf Coast (PADD 3) Propane and Propylene Endi... 59994.0 October
2 2 2023-10-20 R1Y PADD 1B EPLLPZ Propane and Propylene SAXP Ending Stocks Excluding Propylene at Terminal WPRST1B1 Central Atlantic (PADD 1B) Propane and Propyle... 6402.0 October
3 3 2023-10-20 R10 PADD 1 EPM0 Total Gasoline SAE Ending Stocks WGTSTP11 East Coast (PADD 1) Ending Stocks of Total Gas... 56065.0 October
4 4 2023-10-20 R40 PADD 4 EPM0CO Other Conventional Motor Gasoline SAE Ending Stocks WG6ST_R40_1 Rocky Mountain (PADD 4) Ending Stocks of Other... 1111.0 October

Value in MBBL Descriptions¶

In [68]:
pd.DataFrame( weekly_stocks['value'].describe()).rename({'value': 'MBBL'}, axis=1)
Out[68]:
MBBL
count 4.970000e+03
mean 4.800995e+04
std 1.783345e+05
min 0.000000e+00
25% 3.842500e+02
50% 4.945000e+03
75% 2.637200e+04
max 1.629265e+06

Petroleum Stocks by PADD Districts and Process Name¶

In [135]:
axis = sns.catplot(data=weekly_stocks[~weekly_stocks['area_name'].isna()], col='area_name', y='value', x='process_name', kind='box', sharey=False )
for ax in axis.axes.ravel():
    ax.set_title(ax.get_title().replace("area_name =", "" ).strip(), fontweight='bold', fontsize=16)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontweight='bold' )
    ax.set_ylabel("MBBL", fontweight='bold')
No description has been provided for this image

Histogram Weekly Petroleum Stocks by Process Name¶

In [149]:
fig = plt.figure(figsize=(20,10)) 

for index,process_name in enumerate(weekly_stocks['process_name'].unique().tolist(), 1):
    ax= fig.add_subplot(3,2,index)
    weekly_stocks.query(f"process_name == '{process_name}' ").plot(kind='hist', by='process_name', column='value',  ax=ax)
    ax.set_title("Histogram: %s" % (process_name), fontweight='bold', fontsize=12 )

plt.tight_layout() 
No description has been provided for this image

Distribution of Petroleum by Product Name in MBBL¶

In [75]:
fig = plt.figure(figsize=(20,10)) 

for index,process_name in enumerate(weekly_stocks['process_name'].unique().tolist(),1):
    
    ax = fig.add_subplot(3,2,index)
    current_df: 'DataFrame' = weekly_stocks.query(f"process_name == '{process_name}' ")['value'] 
    sns.distplot(current_df, ax=ax)
    ax.set_title("Distribution of Pertroleum By %s in MBBL" % (process_name) , fontweight='bold', fontsize=16)
    ymin,ymax = ax.get_ylim()

    mean: float = current_df.mean() 
    median: float = current_df.median() 
    ax.vlines(x=mean, ymin=ymin, ymax=ymax, label=f"Mean: {mean:.2f}", linestyle="-.")
    ax.vlines(x=median, ymin=ymin, ymax=ymax, label=f"Median: {median:.2f}", linestyle="--", color='black')
    ax.legend()
    
plt.tight_layout() 
No description has been provided for this image

Weekly Trends in Petroleum Stocks by Product Name¶

In [90]:
fig = plt.figure(figsize=(50,25))
for index,product_name in enumerate( weekly_stocks['product_name'].unique().tolist() ,1 ):
    ax = fig.add_subplot(6,5,index)
    weekly_stocks.query(f"product_name == '{product_name}' ").groupby('period')['value'].mean().plot(ax=ax, mec='black', marker='s')
    ax.set_title("Trends in Weekly %s " % (product_name), fontweight='bold', fontsize=16)
    ax.set_ylabel("MBBL", fontweight='bold')

plt.tight_layout()
No description has been provided for this image

Weekly Crude Oil and Petroleum Stocks by Month¶

In [162]:
weekly_stocks[ weekly_stocks['product_name'].apply(lambda row: 'crude' in row.lower()) ].groupby(['product', 'month'])['value'].mean().unstack('month').rename(dict(map(lambda item: (item[0],item[1]), weekly_stocks[['month','month_name']].drop_duplicates().sort_values(by='month').values.tolist())) ,axis=1)
Out[162]:
month March April May June July August September October
product
EP00 1.477518e+06 1417923.00 1417271.25 1438666.20 1447800.125 1438442.5 1441043.80 1.445540e+06
EPC0 2.618143e+05 216559.65 214249.10 212346.72 209326.825 203913.8 197959.44 1.993029e+05

Modeling Weekly Petroleum Stocks¶

In [177]:
crude_oils: 'DataFrame' = weekly_stocks[ weekly_stocks['product_name'].apply(lambda row: 'crude' in row.lower()) ]
co_stocks: 'DataFrame' = pd.DataFrame(crude_oils.groupby('period')['value'].mean())
fig = plt.figure() 
ax = fig.add_subplot()
co_stocks.plot(figsize=(12,4), ax=ax, marker='s', mec='black' )
ax.set_title("Weekly Petroleum Stocks", fontweight='bold', fontsize=12)

plt.tight_layout() 
No description has been provided for this image

Modeling Crude Oil Weekly Stocks using LSTM Neural Networks¶

In [224]:
scaler: MinMaxScaler = MinMaxScaler() 
co_stocks['value'] = scaler.fit_transform(co_stocks['value'].values.reshape(-1,1) )
stocks_df: 'DataFrame' = pd.DataFrame({ 'time' :co_stocks.index.tolist(), 'values':  co_stocks['value'].tolist() })
stocks_df['time'] = stocks_df['time'].apply(lambda row: row.timestamp())
sequence_length: int = 7 
X,y = [], []

for index in range(co_stocks.shape[0] - sequence_length):
    X.append(stocks_df[['time', 'values']][index: index + sequence_length].values)
    y.append(stocks_df['values'][ index + sequence_length] ) 

X = np.array(X)
y = np.array(y) 

X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2, shuffle=False)

Model One¶

In [226]:
model: 'LSTM' = tf.keras.models.Sequential([tf.keras.layers.LSTM(64, activation='relu', input_shape=(sequence_length, 2)),
                                            tf.keras.layers.Dense(1), 
                                            tf.keras.layers.Dropout(0.2), # prevent overfitting
                                            tf.keras.layers.BatchNormalization()     
                                            ])

model.compile(optimizer='adam', loss='mean_squared_error')
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])

residuals = (y_test - model.predict(X_test).flatten() ).tolist()  # residuals 
Epoch 1/50
1/1 [==============================] - 2s 2s/step - loss: 1.0168 - val_loss: 704.9772
Epoch 2/50
1/1 [==============================] - 0s 37ms/step - loss: 0.9898 - val_loss: 226.8252
Epoch 3/50
1/1 [==============================] - 0s 47ms/step - loss: 1.0080 - val_loss: 135.5394
Epoch 4/50
1/1 [==============================] - 0s 37ms/step - loss: 0.9814 - val_loss: 52.9512
Epoch 5/50
1/1 [==============================] - 0s 37ms/step - loss: 1.0193 - val_loss: 47.7999
Epoch 6/50
1/1 [==============================] - 0s 43ms/step - loss: 0.9908 - val_loss: 76.5300
Epoch 7/50
1/1 [==============================] - 0s 36ms/step - loss: 1.0068 - val_loss: 60.8997
Epoch 8/50
1/1 [==============================] - 0s 37ms/step - loss: 1.0018 - val_loss: 51.9042
Epoch 9/50
1/1 [==============================] - 0s 38ms/step - loss: 0.9889 - val_loss: 45.9118
Epoch 10/50
1/1 [==============================] - 0s 37ms/step - loss: 0.9862 - val_loss: 38.7694
Epoch 11/50
1/1 [==============================] - 0s 36ms/step - loss: 0.9916 - val_loss: 34.1324
Epoch 12/50
1/1 [==============================] - 0s 38ms/step - loss: 0.9765 - val_loss: 32.8172
Epoch 13/50
1/1 [==============================] - 0s 37ms/step - loss: 0.9779 - val_loss: 28.9891
Epoch 14/50
1/1 [==============================] - 0s 38ms/step - loss: 0.9907 - val_loss: 25.5824
Epoch 15/50
1/1 [==============================] - 0s 38ms/step - loss: 0.9704 - val_loss: 23.6251
Epoch 16/50
1/1 [==============================] - 0s 39ms/step - loss: 0.9683 - val_loss: 22.7849
Epoch 17/50
1/1 [==============================] - 0s 40ms/step - loss: 0.9380 - val_loss: 20.5374
Epoch 18/50
1/1 [==============================] - 0s 38ms/step - loss: 0.9577 - val_loss: 9.3650
Epoch 19/50
1/1 [==============================] - 0s 36ms/step - loss: 0.9717 - val_loss: 8.9872
Epoch 20/50
1/1 [==============================] - 0s 38ms/step - loss: 0.9890 - val_loss: 8.6392
Epoch 21/50
1/1 [==============================] - 0s 38ms/step - loss: 0.9585 - val_loss: 8.3487
Epoch 22/50
1/1 [==============================] - 0s 38ms/step - loss: 0.9773 - val_loss: 7.8945
Epoch 23/50
1/1 [==============================] - 0s 38ms/step - loss: 0.9584 - val_loss: 7.3989
Epoch 24/50
1/1 [==============================] - 0s 37ms/step - loss: 0.9703 - val_loss: 6.9422
Epoch 25/50
1/1 [==============================] - 0s 37ms/step - loss: 0.9444 - val_loss: 6.6486
Epoch 26/50
1/1 [==============================] - 0s 37ms/step - loss: 1.0007 - val_loss: 6.3464
Epoch 27/50
1/1 [==============================] - 0s 38ms/step - loss: 0.9395 - val_loss: 5.9347
Epoch 28/50
1/1 [==============================] - 0s 41ms/step - loss: 0.9533 - val_loss: 5.5590
Epoch 29/50
1/1 [==============================] - 0s 39ms/step - loss: 0.9396 - val_loss: 11.5034
Epoch 30/50
1/1 [==============================] - 0s 39ms/step - loss: 0.9393 - val_loss: 10.5813
Epoch 31/50
1/1 [==============================] - 0s 37ms/step - loss: 0.9283 - val_loss: 9.7978
Epoch 32/50
1/1 [==============================] - 0s 36ms/step - loss: 0.9389 - val_loss: 9.0630
Epoch 33/50
1/1 [==============================] - 0s 38ms/step - loss: 0.9201 - val_loss: 8.4980
Epoch 34/50
1/1 [==============================] - 0s 36ms/step - loss: 0.9280 - val_loss: 8.0267
Epoch 35/50
1/1 [==============================] - 0s 36ms/step - loss: 0.9524 - val_loss: 7.7104
Epoch 36/50
1/1 [==============================] - 0s 36ms/step - loss: 0.9313 - val_loss: 7.2698
Epoch 37/50
1/1 [==============================] - 0s 34ms/step - loss: 0.9157 - val_loss: 6.7465
Epoch 38/50
1/1 [==============================] - 0s 38ms/step - loss: 0.9481 - val_loss: 6.2364
Out[226]:
<keras.src.callbacks.History at 0xffff24eeb340>

Evaluating Model One Residuals¶

  • Show no autocorrelations after lag zero (errors are random)
In [247]:
fig = plt.figure(figsize=(12,4)) 
ax = fig.add_subplot(1,2,1)
plot_acf(np.array(residuals),ax=ax)
ax.set_title("Model One Correlogram", fontweight='bold', fontsize=14)

ax = fig.add_subplot(1,2,2)
sm.qqplot( np.array(residuals), line='s', ax=ax)
ax.set_title("Model One QQ-Plot Residuals", fontweight='bold', fontsize=14)
plt.tight_layout() 
No description has been provided for this image

LJungbox Tests Model One¶

In [233]:
acorr_ljungbox(residuals)
Out[233]:
lb_stat lb_pvalue
1 0.842894 0.35857

Washington States Crude Oils Import/Export¶

In [258]:
wa_df: 'DataFrame' = pd.read_sql("""SELECT CAST(i.period AS DATETIME ) AS `timestamp`, 
                                            i.value 
                                    FROM import_export_washington i""", con=con).replace({np.nan:0.0} )

wa_stl = STL(wa_df.set_index('timestamp')['value'], period=7).fit()

Washington Seasonal Import/Export¶

In [274]:
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(2,2,1) 
wa_stl.seasonal.plot(ax=ax) 
ax.set_title("Seasonal WA Import/Export Crude Oils", fontweight='bold', fontsize= 16)

ax = fig.add_subplot(2,2,2) 
ax.set_title("Seasonal WA Import/Export Crude Oils", fontweight='bold', fontsize= 16)
wa_stl.observed.plot(ax=ax)

ax = fig.add_subplot(2,2,3) 
ax.set_title("Trends WA Import/Export Crude Oils", fontweight='bold', fontsize= 16)
wa_stl.trend.plot(ax=ax)

ax = fig.add_subplot(2,2,4) 
ax.set_title("Residuals WA Import/Export Crude Oils", fontweight='bold', fontsize= 16)
wa_stl.resid.plot(ax=ax)
Out[274]:
<Axes: title={'center': 'Residuals WA Import/Export Crude Oils'}, xlabel='timestamp'>
No description has been provided for this image

Weekly and Monthly Crude Oils Import/Export¶

In [281]:
fig = plt.figure(figsize=(22,5)) 
ax = fig.add_subplot(1,2,1) 
wa_df.set_index("timestamp")['value'].plot(ax=ax)
ax.set_title("Weekly Crude Oils Import/Exports in WA", fontweight='bold', fontsize=14)
ax.set_ylabel("Mbbl", fontweight='bold')

ax = fig.add_subplot(1,2,2) 
wa_df.set_index('timestamp')['value'].resample('M').mean().plot() 
ax.set_title("Monthly Crude Oils Import/Exports in WA", fontweight='bold', fontsize=14)
ax.set_ylabel("Mbbl", fontweight='bold')
plt.tight_layout()
No description has been provided for this image

Modeling Crude Oil Weekly Import/Export using LSTM Neural Networks¶

In [333]:
wa_df['time'] = wa_df['timestamp'].apply(lambda row: row.timestamp() )
wa_df['values'] = scaler.fit_transform(wa_df['value'].values.reshape(-1,1))
X,y = [], []

for index in range(wa_df.shape[0] - sequence_length):
    X.append(wa_df[['time', 'values']][index: index + sequence_length].values) 
    y.append(wa_df['values'][index + sequence_length] )

X = np.array(X)
y = np.array(y)

X1_train, X1_test, y1_train, y1_test = train_test_split(X,y, test_size=0.2, shuffle=False)

wa_model = tf.keras.models.Sequential([tf.keras.layers.LSTM(64, activation='relu', input_shape=(sequence_length, 2)),
                                       tf.keras.layers.Dense(1, activation='relu'),
                                       tf.keras.layers.Dropout(0.4), 
                                       tf.keras.layers.BatchNormalization()
                                      ])

wa_model.compile(optimizer='adam', loss='mean_squared_error')
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
hist = wa_model.fit(X1_train,y1_train, epochs=50, batch_size=32, validation_data=(X1_test, y1_test), callbacks=[early_stopping])

residuals: float = y1_test - wa_model.predict(X1_test).flatten() # compute LSTM model residuals
Epoch 1/50
26/26 [==============================] - 2s 18ms/step - loss: 0.0192 - val_loss: 0.0553
Epoch 2/50
26/26 [==============================] - 0s 9ms/step - loss: 0.0166 - val_loss: 0.0511
Epoch 3/50
26/26 [==============================] - 0s 9ms/step - loss: 0.0152 - val_loss: 0.0485
Epoch 4/50
26/26 [==============================] - 0s 9ms/step - loss: 0.0147 - val_loss: 0.0473
Epoch 5/50
26/26 [==============================] - 0s 9ms/step - loss: 0.0145 - val_loss: 0.0465
Epoch 6/50
26/26 [==============================] - 0s 9ms/step - loss: 0.0145 - val_loss: 0.0464
Epoch 7/50
26/26 [==============================] - 0s 11ms/step - loss: 0.0145 - val_loss: 0.0460
Epoch 8/50
26/26 [==============================] - 0s 9ms/step - loss: 0.0145 - val_loss: 0.0458
Epoch 9/50
26/26 [==============================] - 0s 9ms/step - loss: 0.0145 - val_loss: 0.0459
Epoch 10/50
26/26 [==============================] - 0s 10ms/step - loss: 0.0145 - val_loss: 0.0459
Epoch 11/50
26/26 [==============================] - 0s 9ms/step - loss: 0.0145 - val_loss: 0.0459
Epoch 12/50
26/26 [==============================] - 0s 9ms/step - loss: 0.0145 - val_loss: 0.0460
Epoch 13/50
26/26 [==============================] - 0s 9ms/step - loss: 0.0145 - val_loss: 0.0461
Epoch 14/50
26/26 [==============================] - 0s 9ms/step - loss: 0.0145 - val_loss: 0.0458
Epoch 15/50
26/26 [==============================] - 0s 9ms/step - loss: 0.0145 - val_loss: 0.0461
Epoch 16/50
26/26 [==============================] - 0s 9ms/step - loss: 0.0145 - val_loss: 0.0458
Epoch 17/50
26/26 [==============================] - 0s 9ms/step - loss: 0.0145 - val_loss: 0.0460
Epoch 18/50
26/26 [==============================] - 0s 9ms/step - loss: 0.0145 - val_loss: 0.0459
7/7 [==============================] - 0s 3ms/step

Model Crude Oils Import/Export Training and Validation Loss¶

In [334]:
fig = plt.figure() 
ax = fig.add_subplot()
pd.DataFrame({'Training Loss': hist.history.get('loss') , 'Validation Loss': hist.history.get('val_loss')}).plot(marker='s', mec='black', ax=ax)
ax.set_title("LSTM Model: Crude Oils Import/Export Training and Validation Loss", fontweight='bold', fontsize=16 ) 
ax.set_ylabel("Loss")
ax.set_xlabel("Epochs")
plt.tight_layout()
No description has been provided for this image

LSTM Model WA: Correlogram and QQ-Plot¶

  • The WA LSTM Model can't be used to forecast the weekly crude oil import/exports in WA state
  1. The residuals correlogram shows that there is autocorrelation after lag 6
  2. Ljungbox: All the p-value < 0.5 (lags=20)
  3. The Residuals QQ-Plot is not a straight line
In [352]:
fig = plt.figure(figsize=(12,4))
ax = fig.add_subplot(1,2, 1) 
plot_acf(np.array(residuals), ax=ax)
ax.set_title("Model Two Residuals Correlogram", fontweight='bold', fontsize=14)

ax = fig.add_subplot(1,2, 2)
sm.qqplot(np.array(residuals), line='s', ax=ax)
ax.set_title("Residuals QQ-Plot Model Two", fontweight='bold', fontsize=14)

plt.tight_layout()
No description has been provided for this image

Model Two: Ljungbox Test¶

In [349]:
pd.concat([acorr_ljungbox(np.array(residuals), lags=20 ), 
           acorr_ljungbox(np.array(residuals), lags=20 ).apply(lambda row: row > 0.5).rename({'lb_pvalue': 'p-value > 0.5' }, axis=1)['p-value > 0.5'] ], axis=1)
Out[349]:
lb_stat lb_pvalue p-value > 0.5
1 11.722117 6.176166e-04 False
2 20.114826 4.286681e-05 False
3 33.684786 2.309197e-07 False
4 56.164928 1.851706e-11 False
5 65.150249 1.043126e-12 False
6 71.750633 1.787758e-13 False
7 74.158761 2.124928e-13 False
8 84.223204 6.878695e-15 False
9 87.687357 4.733775e-15 False
10 88.635917 9.979998e-15 False
11 88.642342 3.074563e-14 False
12 92.321039 1.751812e-14 False
13 92.449349 4.743705e-14 False
14 92.746007 1.146518e-13 False
15 92.792671 2.979982e-13 False
16 95.472160 2.427259e-13 False
17 95.505714 6.009020e-13 False
18 96.454139 9.835653e-13 False
19 97.174375 1.733906e-12 False
20 100.027173 1.245582e-12 False
In [ ]: