ODIN Project¶
import boto3
import pandas as pd
import pymysql
import pymssql
import os
import json
import warnings
import numpy as np
import squarify as sq
from itertools import product
from datetime import datetime
from sqlalchemy import create_engine
from collections import defaultdict
import logging
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.stats.diagnostic import acorr_ljungbox
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import STL
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from matplotlib.colors import ListedColormap
from typing import List, Dict
warnings.filterwarnings('ignore')
pd.options.display.max_rows = 70
Project Resources¶
s3 = boto3.client('s3')
s3.download_file(Bucket=os.environ.get('TAX').split("/")[2], Key='/'.join(os.environ.get('TAX').split("/")[3:]), Filename="/tmp/fueltaxes.xlsx")
DB Auth¶
secrets = boto3.client('secretsmanager')
user1, passwd1, host1, db1, _ = list(json.loads(secrets.get_secret_value(SecretId=os.environ.get('MYSQL')).get('SecretString')).values())
con1: 'MYSQL' = pymysql.connect(user=user1, password=passwd1, db=db1, host=host1)
cursor1 = con1.cursor()
user2, passwd2, host2, db2 = list(json.loads(secrets.get_secret_value(SecretId=os.environ.get('MSSQL')).get('SecretString')).values())
con2: 'MSSQL' = pymssql.connect(user=user2, password=passwd2, database=db2, server=host2)
cursor2 = con2.cursor()
Fuel Taxes¶
Gasoline prices in the United States, a constant presence in our lives, are subject to various factors. Beyond the fundamental dynamics of supply and demand, the influence of taxation adds a complex layer to the composition of gasoline prices. This study embarks on a comprehensive exploration, delving into the multifaceted impact of state gasoline taxes on the cost of fuel.
- State Gasoline Taxes: State governments employ gasoline taxes to fund diverse projects, ranging from infrastructure endeavors to environmental conservation. The resultant disparity in tax rates shapes substantial price variations, affecting consumers and regional economies alike. Understanding the interplay between state budget priorities and gasoline taxes is vital for understanding the multifaceted nature of gasoline pricing.
- Taxation as a Price Driver: State governments have the power to impose gasoline taxes, both at the state and local levels. These taxes are calculated as a fixed amount per gallon or as a percentage of the retail price. As a result, the impact of taxation on gasoline prices is significant, often representing a substantial portion of the final cost consumers pay. The imposition of taxes contributes to the overall price structure and varies across states, leading to notable disparities in gasoline prices from one region to another.
- Consumer Behavior and Economic Impact: High gasoline taxes can alter consumer behavior, influencing travel patterns, vehicle purchases, and commuting choices. Regions with lower gasoline taxes may experience increased economic activity as consumers have more income, potentially leading to higher demand and pricing dynamics. On the other hand, areas with higher gasoline taxes might witness reduced consumption, impacting local economies. The interplay between consumer behavior, taxation, and pricing showcases the intricate balance taxation introduces to the gasoline market.
- Environmental and Policy Consideration: Some states leverage gasoline taxes as a policy tool to promote environmental sustainability. Higher gasoline taxes promote the adoption of fuel-efficient vehicles, public transportation, and cleaner energy alternatives. In this context, taxation influences prices and aligns with broader environmental and policy goals. The convergence of taxation and environmental considerations emphasizes the multifaceted nature of gasoline pricing.
Gasoline prices in the United States are a symphony of economic, regulatory, and societal factors. As a key player in this intricate composition, taxation introduces complex dynamics that ripple through the gasoline market. State gasoline taxes hold a multifaceted role beyond mere pricing, from funding critical infrastructure projects to shaping consumer behavior and advancing environmental objectives. A comprehensive comprehension of taxation's effects on gasoline prices is of utmost importance for individuals and policymakers, particularly as energy landscapes and policy priorities shift.
tax_df: 'DataFrame' = pd.read_excel("/tmp/fueltaxes.xlsx", sheet_name='January 2023')
tax_df = tax_df.iloc[7:63].drop(['Unnamed: 5', 'Unnamed: 10'], axis=1).rename(dict(map(lambda item: (item[0], item[1].strip()), tax_df.iloc[5].dropna().to_dict().items())), axis=1)
tax_df.index = np.arange(1, tax_df.shape[0] + 1)
tax_df = tax_df.replace({np.nan: 0.0})
tax_df.columns = [tax_df.columns[0].strip().lower()] +\
[ ('gasoline_%s' % (col)).strip().replace(' ', '_').lower() for col in tax_df.columns.tolist()[1:5] ] +\
[ ('diesel_%s' % (col)).strip().replace(' ', '_').lower() for col in tax_df.columns.tolist()[5:-1] ] +\
[tax_df.columns[-1].lower()]
states, gasoline_taxes = zip(*tax_df[tax_df.columns.tolist()[:5]].sort_values(by='gasoline_state_tax', ascending=True).set_index(tax_df.columns[0])['gasoline_state_tax'].to_dict().items())
all_states, state_fed_tax = zip(*tax_df[tax_df.columns.tolist()[:5]].sort_values(by='gasoline_state_&_federal', ascending=True).set_index(tax_df.columns[0])['gasoline_state_&_federal'].to_dict().items())
fig = plt.figure(figsize=(25,15) )
ax = fig.add_subplot(2,1,1)
# States
ax.bar(x=list(states), height=list(gasoline_taxes), hatch="//", edgecolor='k')
ax.set_xticklabels(ax.get_xticklabels(), rotation=70, fontweight='bold')
ax.set_title("Gasoline Tax by States", fontweight='bold', fontsize=20)
ax.set_ylabel("$/Gal", fontweight='bold')
plt.tight_layout()
# Fed
ax = fig.add_subplot(2,1,2)
ax.bar(x=list(all_states), height=list(state_fed_tax), hatch="/", edgecolor='k') # color='red')
ax.set_xticklabels(ax.get_xticklabels(), rotation=70, fontweight='bold')
ax.set_title("Gasoline Tax by Federal and State Tax", fontweight='bold', fontsize=20)
ax.set_ylabel("$/Gal", fontweight='bold')
plt.tight_layout()
Gasoline Mixtures¶
Gasoline prices, a constant presence in our daily routines, result from an intricate interplay of factors. Among these, the seasonal variation in gasoline mixture stands out as a critical yet often overlooked contributor to the cost at the pump. This study aims to understand the complexities of the gasoline mixture adjustments between summer and winter and the federal regulations that govern this process.
- Seasonal Gasoline Mixture Adjustments: The transition between summer and winter in the United States brings about more than just a change in weather. It prompts a fundamental shift in gasoline composition to cater to specific environmental and performance demands. During the summer months, gasoline is formulated with a different blend to mitigate the impact of higher temperatures on air quality. Additives like butane and ethanol are incorporated to enhance combustion efficiency and minimize emissions. On the other hand, winter gasoline is designed to ensure reliable engine performance in colder temperatures. With higher volatility, winter gasoline facilitates easier cold starts, preventing stalling and ensuring smooth engine operation. The blending process involves modifying the ratio of hydrocarbons, additives, and ethanol, resulting in a distinct mixture tailored to the unique challenges of winter weather.
- Butane Mixture: Butane is an inexpensive additive with high RVP. Adding butane, a cost-effective hydrocarbon, to gasoline is a double-edged sword. Butane's high RVP is advantageous for cold starts and improved combustion during colder months. Its volatile nature increases evaporation, potentially raising smog levels in warmer seasons. The utilization of butane in gasoline blends represents a delicate balance between cost-effectiveness and environmental impact, further influencing pricing dynamics.
- Federal Regulations and Reid Vapor Pressure (RVP): The complexity of seasonal gasoline mixtures is guided by federal regulations, specifically regarding RVP (Reid Vapor Pressure). RVP measures gasoline's volatility, influencing its evaporation rate and potential contribution to air pollution. The EPA (Environmental Protection Agency) enforces stringent regulations on gasoline RVP levels to mitigate smog formation during warmer months. Federal law restricts the sale of gasoline with an RVP exceeding 9.0 psi (pounds per square inch) from June 1 through September 15, a period characterized by higher temperatures and increased potential for smog formulation. Gasoline retailers must ensure compliance with these regulations, offering summer-blend gasoline with reduced RVP during this period. The transition to summer-blend gasoline involves meticulous adjustments by refineries to maintain compliance with federal guidelines.
The significant interaction between seasonal gasoline mixture adjustments and federal regulations exemplifies the intricate nature of gasoline pricing. The shift from summer to winter blends and vice versa goes beyond mere adaptation to weather conditions; it reflects a delicate balance between environmental concerns, engine performance, and regulatory compliance. Federal laws governing gasoline RVP levels during specific periods underscore the commitment to air quality preservation. Understanding these complexities is paramount as consumers and policymakers navigate the landscape of gasoline prices. Seasonal gasoline mixture adjustments, guided by federal regulations, are a testament to the multifaceted approach required to ensure efficient and environmentally conscious fuel consumption. In a world shaped by evolving energy standards and environmental consciousness, unraveling the intricate tapestry of gasoline mixture dynamics and federal oversight is a compass for a more sustainable energy future.
%%time
tbl_one: str = os.environ.get('tbl_one')
# 1.7e6
query: str = f"""
SELECT
lg.index,
lg.gas_station,
lg.country,
lg.city,
lg.address,
lg.zip_code,
lg.state,
lg.regular_gas,
lg.midgrade_gas,
lg.premium_gas,
lg.price_unit,
lg.star_rating,
lg.latitude,
lg.longitude,
lg.timestamp,
CASE
WHEN lg.timestamp BETWEEN STR_TO_DATE(CONCAT(YEAR(lg.timestamp),'-', 6, '-', DAY(lg.timestamp) ), '%Y-%m-%d' ) AND STR_TO_DATE(CONCAT(YEAR(lg.timestamp),'-', 9, '-', DAY(lg.timestamp) ), '%Y-%m-%d' ) THEN 'yes'
ELSE 'no'
END AS rvp,
CASE
WHEN MONTHNAME(lg.timestamp) IN ('March', 'April', 'May') THEN 'Spring'
WHEN MONTHNAME(lg.timestamp) IN ('June', 'July', 'August') THEN 'Summer'
WHEN MONTHNAME(lg.timestamp) IN ('September', 'October', 'November') THEN 'Fall'
ELSE 'Winter'
END AS `season`,
lg.price,
lg.review,
lg.review_date,
lg.sentiment_score
FROM {tbl_one} lg
"""
gasprices_df: 'DataFrame' = pd.read_sql(query,con=con1)
gasprices_df['transc_date'] = pd.to_datetime(gasprices_df['timestamp']).apply(lambda row: row.strftime('%Y-%m-%d'))
gasprices_df.head()
CPU times: user 16.6 s, sys: 852 ms, total: 17.5 s Wall time: 17.5 s
index | gas_station | country | city | address | zip_code | state | regular_gas | midgrade_gas | premium_gas | ... | latitude | longitude | timestamp | rvp | season | price | review | review_date | sentiment_score | transc_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1st Place Food Mart | US | Tillmans Corner | 7000 Three Notch-Kroner Rd | 36619 | AL | 1 | 1 | 1 | ... | 30.589132 | -88.205224 | 2023-05-21T12:10:31.001Z | no | Spring | 2.82 | love the cheaper gas. | 2022-05-02T18:39:36.45Z | 1.00000 | 2023-05-21 |
1 | 1 | Zippy Mart | US | Tillmans Corner | 5936 Three Notch Rd | 36619 | AL | 1 | 1 | 1 | ... | 30.589263 | -88.179333 | 2023-05-22T00:19:42.007Z | no | Spring | 2.82 | GOOD SERVICE | 2017-01-23T09:40:24.88Z | 0.57020 | 2023-05-22 |
2 | 2 | AAFES | US | Fort McClellan | 1167 Fremont Rd | 36205 | AL | 1 | 1 | 1 | ... | 33.731789 | -85.791281 | 2023-05-21T19:29:16.272Z | no | Spring | 2.82 | Military and family members only. They earned ... | 2019-06-27T20:13:50.033Z | -0.19800 | 2023-05-21 |
3 | 3 | Sam's Club | US | Auburn | 2335 Bent Creek Rd | 36830 | AL | 1 | 0 | 1 | ... | 32.608063 | -85.428061 | 2023-05-22T02:21:02.515Z | no | Spring | 2.84 | Generally best price in town. | 2022-11-02T19:20:19.87Z | 0.81845 | 2023-05-22 |
4 | 4 | Sunoco | US | Leeds | 7501 Parkway Dr | 35094 | AL | 1 | 0 | 1 | ... | 33.544691 | -86.554397 | 2023-05-21T13:38:20.812Z | no | Spring | 2.82 | great service and amazing Race Fuels | 2020-06-18T14:01:04.797Z | 0.76800 | 2023-05-21 |
5 rows × 22 columns
Remove Duplicates Entries from DB1¶
backup_df: 'DataFrame' = gasprices_df.drop_duplicates(['gas_station', 'timestamp', 'review', 'review_date', 'latitude' ,'longitude'])
backup_df.to_csv(f"{_}/dataset/backup_livegasprices_tbl_{datetime.utcnow().isoformat()}.csv")
cols: List[str] = gasprices_df.iloc[0].index.tolist()
url: str = "mysql+pymysql://%s:%s@%s:3306/%s" % (user1,passwd1,host1,db1)
cursor1.execute(f"DROP TABLE {tbl_one}")
con1.commit()
engine: 'MYSQL' = create_engine(url)
backup_df.to_sql(tbl_one, con=engine, if_exists='replace')
Explore Historical Gasoline Prices by States¶
Boxplot Gasoline Prices by States¶
fig = plt.figure(figsize=(25,10))
ax = fig.add_subplot()
gasprices_df.boxplot(by='state', column='price',ax=ax)
ax.set_title("Gasoline Prices by States", fontweight='bold', fontsize=20)
ax.set_ylabel("$/Gallon", fontweight='bold')
plt.tight_layout()
Histogram Gasoline Prices by States¶
plt.rcParams['figure.figsize'] = (35,25)
plt.rcParams['font.weight'] = 'bold'
axs = gasprices_df.hist(by='state', column='price')
states: List[str] = gasprices_df['state'].dropna().astype(str).apply(lambda row: row.strip()).unique().tolist()
Average Gasoline Prices and Number of Visitors¶
states, avg_price = zip(*gasprices_df.groupby('state')['price'].agg('mean').to_dict().items())
fig = plt.figure(figsize=(25,15))
ax = fig.add_subplot(2,1,1)
sq.plot(sizes=list(avg_price),
label=[ f"{item[0]}: ${item[1]:.2f}" for item in zip(states, avg_price) ],
ax=ax,
color=sns.color_palette(palette='Blues', n_colors=51)
)
ax.axis('off')
ax.invert_xaxis()
ax.set_title("Average Gasoline Prices by U.S States", fontweight='bold', fontsize=20)
gas_stations, n_transcs = zip(*gasprices_df['gas_station'].value_counts().head(25).items())
ax = fig.add_subplot(2,1,2)
ax.bar(x=list(gas_stations), height=list(n_transcs),edgecolor='k', hatch='/')
ax.set_xticklabels(ax.get_xticklabels(), fontweight='bold', rotation=75)
ax.set_title("Top 25 Gas Stations in the United States", fontweight='bold', fontsize=20)
for index,visitor in enumerate(n_transcs):
ax.annotate(visitor, xy=(index - 0.23,visitor), fontweight='bold')
ax.set_ylabel('Number of Visitors', fontweight='bold' )
plt.tight_layout()
Average Gasoline Prices by States and Gas Stations¶
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot()
sns.heatmap( pd.pivot_table(data=gasprices_df[gasprices_df['gas_station'].isin(gas_stations)], columns=['state'], index=['gas_station'], values='price', aggfunc='mean').replace({np.nan:0.0}),
annot=True,
ax=ax,
center=True,
cmap=ListedColormap(sns.color_palette(palette='Blues_r', n_colors=60) ))
ax.set_title("Average Gasoline Prices by States and Gas Stations", fontweight='bold', fontsize=20)
plt.tight_layout()
Trends: Total Average Gasoline Price in the U.S¶
avg_gasoline_prices: 'Series' = gasprices_df.groupby('transc_date')['price'].mean()
avg_gasoline_prices.index = pd.to_datetime(avg_gasoline_prices.index )
fig = plt.figure(figsize=(10,4))
ax = fig.add_subplot()
avg_gasoline_prices.plot( marker='s', mec='black', mew=1.8) # Average Gasoline Prices in the U.S
ax.set_title("Total Average Gasoline Price in the U.S", fontweight='bold', fontsize=20)
ax.set_ylabel("$/Gallon", fontweight='bold')
plt.tight_layout()
Modeling Gasoline Prices in Washington State using SARIMAX¶
- Exo variables:
- Season (Not enough data)
- RVP
- Gasoline Types
Note:
- source_one was collected from May 19,2023, until today.
- source_two was collected from August 04, 2023, until today.
query: str = f"""
/*
Gasoline price in State of Washington
*/
SELECT
g.gas_station_name,
g.pay_status,
g.posted_time,
g.address,
g.zip_code,
g.payment_type,
g.price
FROM {mssql_func_one}('WA') g ORDER BY CAST( posted_time AS DATETIME ) ASC
"""
wa_prices_source2: 'DATAFRAME' = pd.read_sql(query, con=con2) # Source 2
wa_prices_source2.head()
gas_station_name | pay_status | posted_time | address | zip_code | payment_type | price | |
---|---|---|---|---|---|---|---|
0 | Swinomish Markets at the Casio | 1 | 2023-08-04 18:23:19.927 | 12939 Casino Dr | 98221-8351 | cash | 4.15 |
1 | Swinomish Markets at the Casio | 1 | 2023-08-04 18:23:19.927 | 12939 Casino Dr | 98221-8351 | cash | 4.15 |
2 | Hwy 97 Truck Plaza | 1 | 2023-08-05 01:41:02.793 | 61313 Hwy 97 | 98948 | credit_card | 4.15 |
3 | Hwy 97 Truck Plaza | 1 | 2023-08-05 01:41:02.793 | 61313 Hwy 97 | 98948 | credit_card | 4.15 |
4 | Meridian Liquor Store | 1 | 2023-08-05 12:14:32.347 | 4209 Meridian St | 98226-5512 | cash | 4.39 |
wa_prices_source2.groupby(['gas_station_name', 'payment_type'])['price'].mean().unstack('payment_type').replace({np.nan: 0.0}).applymap(lambda row: round(row, 2) ).replace({0.0: "N/A"})
payment_type | cash | credit_card |
---|---|---|
gas_station_name | ||
76 | 4.3 | N/A |
ARCO | 4.33 | N/A |
Angel of the Winds Fuel | N/A | 4.45 |
Costco | N/A | 4.3 |
Harolds Market | N/A | 4.1 |
Hwy 97 Truck Plaza | N/A | 4.3 |
JK Gas & Groceries | 4.39 | N/A |
Little Mountain Grocery | 4.4 | N/A |
Maverik | N/A | 4.37 |
Meridian Liquor Store | 4.44 | N/A |
Midway Gas & Food | 4.42 | N/A |
Safeway | 4.35 | 4.36 |
Samcor Fuel | N/A | 4.39 |
Sinclair | N/A | 4.3 |
Super Duper | 4.45 | N/A |
Super Gas | N/A | 4.47 |
Sure Save Grocery | N/A | 4.49 |
Swinomish Markets at the Casio | 4.2 | N/A |
Swinomish Markets at the Villa | 4.25 | N/A |
The Tote | N/A | 4.4 |
Topp Mart | N/A | 4.29 |
Walmart | N/A | 4.33 |
Wheelers Kountry Korner | 4.14 | N/A |
Yakamart | N/A | 4.2 |
Yakima Ave. Quick Stop | 4.39 | N/A |
plt.style.use('ggplot')
gasprices_df['transc_date'] = pd.to_datetime(gasprices_df['transc_date'])
gasprices_df['rvp_yes'] = pd.get_dummies( gasprices_df['rvp'], prefix='rvp' ).replace({True: 1, False: 0})['rvp_yes']
gasprices_df['timestamp'] = pd.to_datetime(gasprices_df['timestamp'])
wa_prices_df: 'DataFrame' = gasprices_df.query("state == 'WA' ")
wa_prices_df['month_name'] = pd.to_datetime(wa_prices_df['timestamp']).apply(lambda row: row.month_name())
sort_months: List[str] = pd.Series(pd.date_range(start='05-01-2023' , end='09-01-2023', freq='M')).apply(lambda row: row.month_name()).tolist()
gasprice_by_city: 'DataFrame' = pd.pivot_table(data=wa_prices_df, index=['month_name'], columns=['city'], aggfunc='mean', values='price').replace({np.nan: 0.0}).transpose()[sort_months]
pd.DataFrame( gasprices_df.query("state == 'WA' ").groupby('season')['price'].mean().replace({0.0: np.nan}).dropna()).rename({'price': 'Gasoline Price in WA'}, axis=1)
Gasoline Price in WA | |
---|---|
season | |
Spring | 3.961230 |
Summer | 4.172369 |
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot()
sns.heatmap(gasprice_by_city, annot=True, center=True ,ax=ax, cmap=ListedColormap(sns.color_palette(palette='flare', n_colors=70)), linewidths=0.7, linecolor='black')
ax.set_title("AVG Gasoline Price by the City", fontweight='bold', fontsize=20)
ax.set_xticklabels(ax.get_xticklabels(), fontweight='bold')
ax.set_yticklabels(ax.get_yticklabels(), fontweight='bold')
plt.tight_layout()
fig = plt.figure(figsize=(25,8))
ax = fig.add_subplot(1,3,1)
gasprices_df.query("state == 'WA' ").groupby('transc_date')['price'].mean().plot(ax=ax, marker='s', mec='black')
ax.set_title("Gasoline Historical Price in WA", fontweight='bold', fontsize=20)
ax.set_ylabel("$/Gallon", fontweight='bold')
ax = fig.add_subplot(1,3,2)
gasprices_df.query("state == 'WA' ").groupby('transc_date')['price'].mean().resample("W").mean().interpolate().plot(ax=ax, marker='s', mec='black')
ax.set_title("Gasoline Weekly Historical Price in WA", fontweight='bold', fontsize=20)
ax.set_ylabel("$/Gallon", fontweight='bold')
ax = fig.add_subplot(1,3,3)
gasprices_df.query("state == 'WA' ").groupby('transc_date')['price'].mean().resample("M").mean().interpolate().plot(ax=ax, marker='s', mec='black')
ax.set_title("Gasoline Monthly Historical Price in WA", fontweight='bold', fontsize=20)
ax.set_ylabel("$/Gallon", fontweight='bold')
plt.tight_layout()
stl: 'STL' = STL(gasprices_df.query("state == 'WA' ").groupby('transc_date')['price'].mean(), period=365).fit()
fig = plt.figure(figsize=(20,8))
ax = fig.add_subplot(2,2,1)
stl.observed.plot(ax=ax)
ax.set_title("WA Observed Gasoline Prices", fontweight='bold', fontsize=20)
ax = fig.add_subplot(2,2,2)
stl.trend.plot(ax=ax)
ax.set_title("WA Trend Gasoline Prices", fontweight='bold', fontsize=20)
ax = fig.add_subplot(2,2,3)
stl.seasonal.plot(ax=ax)
ax.set_title("WA Seasonal Gasoline Prices", fontweight='bold', fontsize=20)
ax = fig.add_subplot(2,2,4)
stl.resid.plot(ax=ax)
ax.set_title("WA Residual Gasoline Prices", fontweight='bold', fontsize=20)
plt.tight_layout()
Autocorrelation and Partial Autocorrelation Plot¶
Note: the decaying acf plot shows that there is a partial autocorrelation function
daily_price: 'Series' = wa_prices_df.groupby('transc_date')['price'].mean()
daily_price.index = pd.to_datetime(daily_price.index)
# the decaying acf functions shows that there is a partial autocorrelation
fig = plt.figure(figsize=(12,4))
ax = fig.add_subplot(1,2,1)
acf = plot_acf(daily_price, ax=ax)
ax = fig.add_subplot(1,2,2)
pacf = plot_pacf(daily_price, ax=ax) # AR(1)
Adfuller Test¶
- Integration Order: 1
- Seasonal Integration Order: 0
adf_one: 'adfuller' = adfuller(daily_price)
pd.DataFrame( {'ADF':adf_one[0],
'P-Value': adf_one[1]},
index=['ADF Test']).transpose()
ADF Test | |
---|---|
ADF | -1.482823 |
P-Value | 0.541991 |
adf_two: 'adfuler' = adfuller(np.diff(daily_price, n=1))
pd.DataFrame( {'ADF':adf_two[0],
'P-Value': adf_two[1]},
index=['ADF Test']).transpose()
ADF Test | |
---|---|
ADF | -3.756059 |
P-Value | 0.003390 |
logging.basicConfig(level=logging.INFO)
log: 'LOG' = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
mem_steps: str = f"{path}/pickles/compute_job.pkl"
train: 'Series' = daily_price.sort_index().iloc[:-14]
test: 'Series' = daily_price.sort_index().iloc[-14:]
d: int = 1
D: int = 0
s: int = 365
orders: List[tuple] = list(product(range(3),range(3),range(3),range(3)))
results: defaultdict = defaultdict(list)
for p,q,P,Q in orders:
if os.path.exists(mem_steps):
current_job: 'DataFrame' = pd.read_pickle(mem_steps)
if (True in current_job.apply(lambda row: True if row['p'] == p and row['q'] == q and row['P'] == P and row['Q'] == Q else False, axis=1).tolist()):
continue
model: SARIMAX = ARIMA(endog=train,
order=(p,d,q),
seasonal_order=(P,D,Q,s),
# enforce_stationarity=False
).fit( cov_type='none', low_memory=True)
#method='cg') # SARIMAX
results['p'].append(p)
results['q'].append(q)
results['P'].append(P)
results['Q'].append(Q)
results['AIC'].append(model.aic)
log.debug("[ + ] (%s,%s,%s,%s) AIC:%s" % (p,q,P,Q,model.aic ) )
if os.path.exists(mem_steps):
temp: List[Dict] = pd.DataFrame(results).to_dict(orient='records')
temp.extend(pd.read_pickle(mem_steps).to_dict(orient='records'))
pd.DataFrame(temp).drop_duplicates(['p', 'q', 'P', 'Q', 'AIC'] ).to_pickle(mem_steps)
else:
current_job = pd.DataFrame(results)
current_job.to_pickle(mem_steps)
DEBUG:__main__:[ + ] (2,1,2,0) AIC:-205.79998795407818 DEBUG:__main__:[ + ] (2,1,2,1) AIC:-203.7999960153758 DEBUG:__main__:[ + ] (2,1,2,2) AIC:-201.799999977549 DEBUG:__main__:[ + ] (2,2,0,0) AIC:-218.48257369948615 DEBUG:__main__:[ + ] (2,2,0,1) AIC:-216.48256084068456 DEBUG:__main__:[ + ] (2,2,0,2) AIC:-214.48259766285858 DEBUG:__main__:[ + ] (2,2,1,0) AIC:-216.4826017333369 DEBUG:__main__:[ + ] (2,2,1,1) AIC:-214.48260172434667 DEBUG:__main__:[ + ] (2,2,1,2) AIC:-212.48246039543835 DEBUG:__main__:[ + ] (2,2,2,0) AIC:-214.48256125980177 DEBUG:__main__:[ + ] (2,2,2,1) AIC:-212.48259922583176 DEBUG:__main__:[ + ] (2,2,2,2) AIC:-210.48257705355968
pd.read_pickle(mem_steps).sort_values(by='AIC').head(25)
p | q | P | Q | AIC | |
---|---|---|---|---|---|
3 | 2 | 2 | 0 | 0 | -218.482574 |
6 | 2 | 2 | 1 | 0 | -216.482602 |
4 | 2 | 2 | 0 | 1 | -216.482561 |
7 | 2 | 2 | 1 | 1 | -214.482602 |
5 | 2 | 2 | 0 | 2 | -214.482598 |
9 | 2 | 2 | 2 | 0 | -214.482561 |
10 | 2 | 2 | 2 | 1 | -212.482599 |
8 | 2 | 2 | 1 | 2 | -212.482460 |
11 | 2 | 2 | 2 | 2 | -210.482577 |
30 | 0 | 1 | 0 | 0 | -209.339313 |
75 | 2 | 0 | 0 | 0 | -209.040195 |
66 | 1 | 2 | 0 | 0 | -208.699817 |
48 | 1 | 0 | 0 | 0 | -208.633456 |
39 | 0 | 2 | 0 | 0 | -208.593247 |
57 | 1 | 1 | 0 | 0 | -207.656525 |
84 | 2 | 1 | 0 | 0 | -207.548823 |
31 | 0 | 1 | 0 | 1 | -207.339313 |
33 | 0 | 1 | 1 | 0 | -207.339313 |
78 | 2 | 0 | 1 | 0 | -207.040195 |
76 | 2 | 0 | 0 | 1 | -207.040184 |
69 | 1 | 2 | 1 | 0 | -206.699919 |
67 | 1 | 2 | 0 | 1 | -206.699856 |
51 | 1 | 0 | 1 | 0 | -206.633456 |
49 | 1 | 0 | 0 | 1 | -206.633456 |
42 | 0 | 2 | 1 | 0 | -206.593247 |
ARIMA Model: Gasoline Prices in WA¶
- order: p:2 d:1 q:2
- seasonal_order: P:0 D:0 Q:0 s:365
p,d,q = 2,1,2
P,D,Q,s = 0,0,0,365
model: ARIMA = ARIMA(endog=train.sort_index(),
order=(p,d,q),
seasonal_order=(P,D,Q,s)).fit( cov_type='none', low_memory=True)
model.summary()
Dep. Variable: | price | No. Observations: | 71 |
---|---|---|---|
Model: | ARIMA(2, 1, 2) | Log Likelihood | 114.241 |
Date: | Wed, 16 Aug 2023 | AIC | -218.483 |
Time: | 18:32:57 | BIC | -207.240 |
Sample: | 0 | HQIC | -214.017 |
- 71 | |||
Covariance Type: | Not computed |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
ar.L1 | 0.0219 | nan | nan | nan | nan | nan |
ar.L2 | -0.6730 | nan | nan | nan | nan | nan |
ma.L1 | -0.4471 | nan | nan | nan | nan | nan |
ma.L2 | 0.8773 | nan | nan | nan | nan | nan |
sigma2 | 0.0022 | nan | nan | nan | nan | nan |
Ljung-Box (L1) (Q): | 0.01 | Jarque-Bera (JB): | 2.15 |
---|---|---|---|
Prob(Q): | 0.91 | Prob(JB): | 0.34 |
Heteroskedasticity (H): | 0.73 | Skew: | -0.35 |
Prob(H) (two-sided): | 0.46 | Kurtosis: | 3.49 |
Warnings:
[1] Covariance matrix not calculated.
ARIMA Diagnostic Plots¶
- This ARIMA is good because:
- The q-q plots show almost a straight line
- The histogram plot shows almost a normal distribution
- Correlogram shows no autocorrelation after lag zero (the error is random)
model_one = model.plot_diagnostics(figsize=(25,10)) #
Ljungbox Test for ARIMA Model¶
- The pvalue > 0.05, which indicates that the ARIMA model is good
acorr_ljungbox(model.resid)
lb_stat | lb_pvalue | |
---|---|---|
1 | 0.043151 | 0.835441 |
2 | 0.077116 | 0.962176 |
3 | 0.090288 | 0.992977 |
4 | 0.110965 | 0.998517 |
5 | 0.127165 | 0.999707 |
6 | 0.128713 | 0.999958 |
7 | 0.133871 | 0.999994 |
8 | 0.135984 | 0.999999 |
9 | 0.190538 | 1.000000 |
10 | 0.208261 | 1.000000 |
fig = plt.figure()
ax = fig.add_subplot()
acorr_ljungbox(model.resid, lags=20)['lb_pvalue'].rename({'lb_pvalue': 'p_value'}).plot(ax=ax, marker='s', mec='black')
ax.set_ylabel("P-Values", fontweight='bold')
ax.set_xlabel("Lags", fontweight='bold')
ax.set_title("Ljungbox Test ARIMA Model", fontweight='bold', fontsize=20)
plt.tight_layout()
Forecast V.S Actual¶
model_one_forecast: 'DataFrame' = pd.DataFrame( model.forecast(steps=5) )
model_one_forecast.index = pd.date_range(start='2023-08-03', end=pd.Series(pd.to_datetime("2023-08-02") + pd.offsets.BDay(3)).astype(str).tolist()[0] , freq='D')
pd.concat([ model_one_forecast, test.head(5) ], axis=1)
predicted_mean | price | |
---|---|---|
2023-08-03 | 4.280549 | 4.289740 |
2023-08-04 | 4.301888 | 4.327059 |
2023-08-05 | 4.322686 | 4.348551 |
2023-08-06 | 4.308781 | 4.293377 |
2023-08-07 | 4.294479 | 4.302000 |
Rolling Forecast¶
WINDOW: int = 1
last_known_price: List = []
model_one: List = []
for index in range(len(train), daily_price.shape[0], WINDOW):
model: ARIMA = ARIMA(endog=train.sort_index(),
order=(p,d,q),
seasonal_order=(P,D,Q,s)).fit( cov_type='none', low_memory=True)
last_known_price.append(daily_price.iloc[index-WINDOW: index].tolist()[0] )
model_one.append(model.get_prediction(0, index + WINDOW - 1).predicted_mean.iloc[-WINDOW:].tolist()[0]) # gasoline_price in WA @day
gasprice_wa: 'DataFrame' = pd.DataFrame({'actual_price': test.tolist(),
'bench_mark': last_known_price,
'model_one': model_one
})
gasprice_wa.index = test.index
gasprice_wa
actual_price | bench_mark | model_one | |
---|---|---|---|
transc_date | |||
2023-08-03 | 4.289740 | 4.310758 | 4.280549 |
2023-08-04 | 4.327059 | 4.289740 | 4.301888 |
2023-08-05 | 4.348551 | 4.327059 | 4.322686 |
2023-08-06 | 4.293377 | 4.348551 | 4.308781 |
2023-08-07 | 4.302000 | 4.293377 | 4.294479 |
2023-08-08 | 4.319149 | 4.302000 | 4.303523 |
2023-08-09 | 4.337143 | 4.319149 | 4.313346 |
2023-08-10 | 4.351569 | 4.337143 | 4.307475 |
2023-08-11 | 4.357353 | 4.351569 | 4.300736 |
2023-08-12 | 4.367761 | 4.357353 | 4.304539 |
2023-08-13 | 4.359437 | 4.367761 | 4.309158 |
2023-08-14 | 4.311438 | 4.359437 | 4.306700 |
2023-08-15 | 4.334505 | 4.311438 | 4.303537 |
2023-08-16 | 4.245902 | 4.334505 | 4.305122 |
fig = plt.figure(figsize=(12,4))
ax = fig.add_subplot()
gasprice_wa.plot(ax=ax, marker='s', mec='black')
ax.set_title("Gasoline Prices in Washington State", fontweight='bold', fontsize=20)
ax.set_ylabel("$/Gallon", fontweight='bold')
ax.set_xlabel("Transcation Date", fontweight='bold')
ax.legend(bbox_to_anchor=(1,1), title="Model Comparison", title_fontsize='medium', shadow=True)
plt.tight_layout()
y_true: 'Series' = test
y_pred: 'Series' = model_one
y_pred_one: 'Series' = last_known_price
pd.DataFrame({'model_one': metrics.mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred),
'bench_mark': metrics.mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred_one)}, index=['MAPE']).transpose()
MAPE | |
---|---|
model_one | 0.007120 |
bench_mark | 0.006264 |
pd.DataFrame({'model_one': metrics.mean_absolute_error(y_true=y_true, y_pred=y_pred),
'bench_mark': metrics.mean_absolute_error(y_true=y_true, y_pred=y_pred_one)}, index=['MAE']).transpose()
MAE | |
---|---|
model_one | 0.030837 |
bench_mark | 0.026956 |