import boto3 
import pandas as pd 
import pymysql
import pymssql 
import os 
import json 
import warnings 
import numpy as np 
import squarify as sq 
from itertools import product
from datetime import datetime 
from sqlalchemy import create_engine
from collections import defaultdict
import logging 

from statsmodels.tsa.statespace.sarimax import SARIMAX 
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.stats.diagnostic import acorr_ljungbox
from sklearn import metrics

import matplotlib.pyplot as plt 
import seaborn as sns 
from statsmodels.tsa.seasonal import STL
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from matplotlib.colors import ListedColormap

from typing import List, Dict 

warnings.filterwarnings('ignore')
pd.options.display.max_rows = 70

s3 = boto3.client('s3')
s3.download_file(Bucket=os.environ.get('TAX').split("/")[2], Key='/'.join(os.environ.get('TAX').split("/")[3:]), Filename="/tmp/fueltaxes.xlsx")

secrets = boto3.client('secretsmanager')
user1, passwd1, host1, db1, _ = list(json.loads(secrets.get_secret_value(SecretId=os.environ.get('MYSQL')).get('SecretString')).values())
con1: 'MYSQL' = pymysql.connect(user=user1, password=passwd1, db=db1, host=host1)
cursor1 = con1.cursor()

user2, passwd2, host2, db2 = list(json.loads(secrets.get_secret_value(SecretId=os.environ.get('MSSQL')).get('SecretString')).values())
con2: 'MSSQL' = pymssql.connect(user=user2, password=passwd2, database=db2, server=host2)
cursor2 = con2.cursor()

tax_df: 'DataFrame' = pd.read_excel("/tmp/fueltaxes.xlsx", sheet_name='January 2023')
tax_df = tax_df.iloc[7:63].drop(['Unnamed: 5', 'Unnamed: 10'], axis=1).rename(dict(map(lambda item: (item[0], item[1].strip()), tax_df.iloc[5].dropna().to_dict().items())), axis=1)
tax_df.index = np.arange(1, tax_df.shape[0] + 1)
tax_df = tax_df.replace({np.nan: 0.0})
tax_df.columns = [tax_df.columns[0].strip().lower()] +\
                 [ ('gasoline_%s' % (col)).strip().replace(' ', '_').lower() for col in tax_df.columns.tolist()[1:5] ] +\
                 [ ('diesel_%s' % (col)).strip().replace(' ', '_').lower() for col in tax_df.columns.tolist()[5:-1] ] +\
                 [tax_df.columns[-1].lower()]

states, gasoline_taxes = zip(*tax_df[tax_df.columns.tolist()[:5]].sort_values(by='gasoline_state_tax', ascending=True).set_index(tax_df.columns[0])['gasoline_state_tax'].to_dict().items())
all_states, state_fed_tax = zip(*tax_df[tax_df.columns.tolist()[:5]].sort_values(by='gasoline_state_&_federal', ascending=True).set_index(tax_df.columns[0])['gasoline_state_&_federal'].to_dict().items())

fig = plt.figure(figsize=(25,15) )
ax = fig.add_subplot(2,1,1)

# States 
ax.bar(x=list(states), height=list(gasoline_taxes), hatch="//", edgecolor='k')
ax.set_xticklabels(ax.get_xticklabels(), rotation=70, fontweight='bold')
ax.set_title("Gasoline Tax by States", fontweight='bold', fontsize=20)
ax.set_ylabel("$/Gal", fontweight='bold')
plt.tight_layout() 

# Fed
ax = fig.add_subplot(2,1,2)
ax.bar(x=list(all_states), height=list(state_fed_tax), hatch="/", edgecolor='k') # color='red')
ax.set_xticklabels(ax.get_xticklabels(), rotation=70, fontweight='bold')
ax.set_title("Gasoline Tax by Federal and State Tax", fontweight='bold', fontsize=20)
ax.set_ylabel("$/Gal", fontweight='bold')
plt.tight_layout()

%%time 
tbl_one: str = os.environ.get('tbl_one') 
# 1.7e6 
query: str = f"""

    SELECT
        lg.index,
        lg.gas_station,
        lg.country,
        lg.city,
        lg.address,
        lg.zip_code,
        lg.state,
        lg.regular_gas,
        lg.midgrade_gas,
        lg.premium_gas,
        lg.price_unit,
        lg.star_rating,
        lg.latitude,
        lg.longitude,
        lg.timestamp,
        CASE 
            WHEN lg.timestamp BETWEEN STR_TO_DATE(CONCAT(YEAR(lg.timestamp),'-', 6, '-', DAY(lg.timestamp) ), '%Y-%m-%d' ) AND STR_TO_DATE(CONCAT(YEAR(lg.timestamp),'-', 9, '-', DAY(lg.timestamp) ), '%Y-%m-%d' ) THEN 'yes'
            ELSE 'no'
        END AS rvp,
        CASE 
            WHEN MONTHNAME(lg.timestamp) IN ('March', 'April', 'May') THEN 'Spring'
            WHEN MONTHNAME(lg.timestamp) IN ('June', 'July', 'August') THEN 'Summer'
            WHEN MONTHNAME(lg.timestamp) IN ('September', 'October', 'November') THEN 'Fall'
            ELSE 'Winter'
        END AS `season`, 
        lg.price,
        lg.review,
        lg.review_date,
        lg.sentiment_score
        
    FROM {tbl_one} lg 
""" 

gasprices_df: 'DataFrame' = pd.read_sql(query,con=con1)
gasprices_df['transc_date'] = pd.to_datetime(gasprices_df['timestamp']).apply(lambda row: row.strftime('%Y-%m-%d'))

gasprices_df.head()

CPU times: user 16.6 s, sys: 852 ms, total: 17.5 s
Wall time: 17.5 s

backup_df: 'DataFrame' = gasprices_df.drop_duplicates(['gas_station', 'timestamp', 'review', 'review_date', 'latitude' ,'longitude']) 
backup_df.to_csv(f"{_}/dataset/backup_livegasprices_tbl_{datetime.utcnow().isoformat()}.csv")
cols: List[str] = gasprices_df.iloc[0].index.tolist()
url: str = "mysql+pymysql://%s:%s@%s:3306/%s"  % (user1,passwd1,host1,db1)

cursor1.execute(f"DROP TABLE {tbl_one}")
con1.commit() 

engine: 'MYSQL' = create_engine(url)
backup_df.to_sql(tbl_one, con=engine, if_exists='replace')

fig = plt.figure(figsize=(25,10)) 
ax = fig.add_subplot() 
gasprices_df.boxplot(by='state', column='price',ax=ax)
ax.set_title("Gasoline Prices by States", fontweight='bold', fontsize=20)
ax.set_ylabel("$/Gallon", fontweight='bold')

plt.tight_layout()

plt.rcParams['figure.figsize'] = (35,25)
plt.rcParams['font.weight'] = 'bold'
axs = gasprices_df.hist(by='state', column='price')
states: List[str] = gasprices_df['state'].dropna().astype(str).apply(lambda row: row.strip()).unique().tolist()

states, avg_price = zip(*gasprices_df.groupby('state')['price'].agg('mean').to_dict().items())
fig = plt.figure(figsize=(25,15))
ax = fig.add_subplot(2,1,1) 

sq.plot(sizes=list(avg_price), 
        label=[ f"{item[0]}: ${item[1]:.2f}" for item in zip(states, avg_price) ],
        ax=ax,
        color=sns.color_palette(palette='Blues', n_colors=51)
       )

ax.axis('off')
ax.invert_xaxis()
ax.set_title("Average Gasoline Prices by U.S States", fontweight='bold', fontsize=20)

gas_stations, n_transcs = zip(*gasprices_df['gas_station'].value_counts().head(25).items())
ax = fig.add_subplot(2,1,2)
ax.bar(x=list(gas_stations), height=list(n_transcs),edgecolor='k', hatch='/')
ax.set_xticklabels(ax.get_xticklabels(), fontweight='bold', rotation=75)
ax.set_title("Top 25 Gas Stations in the United States", fontweight='bold', fontsize=20)
for index,visitor in enumerate(n_transcs): 
    ax.annotate(visitor, xy=(index - 0.23,visitor), fontweight='bold')
ax.set_ylabel('Number of Visitors', fontweight='bold' )


plt.tight_layout()

fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot()
sns.heatmap( pd.pivot_table(data=gasprices_df[gasprices_df['gas_station'].isin(gas_stations)], columns=['state'], index=['gas_station'], values='price', aggfunc='mean').replace({np.nan:0.0}), 
            annot=True, 
            ax=ax, 
            center=True,
            cmap=ListedColormap(sns.color_palette(palette='Blues_r', n_colors=60) ))
ax.set_title("Average Gasoline Prices by States and Gas Stations", fontweight='bold', fontsize=20)

plt.tight_layout()

avg_gasoline_prices: 'Series' = gasprices_df.groupby('transc_date')['price'].mean()
avg_gasoline_prices.index = pd.to_datetime(avg_gasoline_prices.index )

fig = plt.figure(figsize=(10,4)) 
ax = fig.add_subplot()
avg_gasoline_prices.plot( marker='s', mec='black', mew=1.8)  # Average Gasoline Prices in the U.S
ax.set_title("Total Average Gasoline Price in the U.S", fontweight='bold', fontsize=20)
ax.set_ylabel("$/Gallon", fontweight='bold')

plt.tight_layout()

query: str = f"""
                /*
                    Gasoline price in State of Washington 
                */
                SELECT 
                    g.gas_station_name, 
                    g.pay_status, 
                    g.posted_time,
                    g.address, 
                    g.zip_code,
                    g.payment_type,
                    g.price
                
                FROM {mssql_func_one}('WA') g ORDER BY CAST( posted_time AS DATETIME ) ASC
                
            """


wa_prices_source2: 'DATAFRAME' = pd.read_sql(query, con=con2) # Source 2 
wa_prices_source2.head()

wa_prices_source2.groupby(['gas_station_name', 'payment_type'])['price'].mean().unstack('payment_type').replace({np.nan: 0.0}).applymap(lambda row: round(row, 2) ).replace({0.0: "N/A"})

plt.style.use('ggplot')
gasprices_df['transc_date'] = pd.to_datetime(gasprices_df['transc_date'])
gasprices_df['rvp_yes'] = pd.get_dummies( gasprices_df['rvp'], prefix='rvp' ).replace({True: 1, False: 0})['rvp_yes']
gasprices_df['timestamp'] = pd.to_datetime(gasprices_df['timestamp'])

wa_prices_df: 'DataFrame' = gasprices_df.query("state == 'WA' ")
wa_prices_df['month_name'] = pd.to_datetime(wa_prices_df['timestamp']).apply(lambda row: row.month_name())
sort_months: List[str] = pd.Series(pd.date_range(start='05-01-2023' , end='09-01-2023', freq='M')).apply(lambda row: row.month_name()).tolist() 
gasprice_by_city: 'DataFrame' = pd.pivot_table(data=wa_prices_df, index=['month_name'], columns=['city'], aggfunc='mean', values='price').replace({np.nan: 0.0}).transpose()[sort_months]

pd.DataFrame( gasprices_df.query("state == 'WA' ").groupby('season')['price'].mean().replace({0.0: np.nan}).dropna()).rename({'price': 'Gasoline Price in WA'}, axis=1)

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot()
sns.heatmap(gasprice_by_city, annot=True, center=True ,ax=ax, cmap=ListedColormap(sns.color_palette(palette='flare', n_colors=70)), linewidths=0.7, linecolor='black')

ax.set_title("AVG Gasoline Price by the City", fontweight='bold', fontsize=20)
ax.set_xticklabels(ax.get_xticklabels(), fontweight='bold')
ax.set_yticklabels(ax.get_yticklabels(), fontweight='bold')

plt.tight_layout()

fig = plt.figure(figsize=(25,8)) 
ax = fig.add_subplot(1,3,1) 
gasprices_df.query("state == 'WA' ").groupby('transc_date')['price'].mean().plot(ax=ax, marker='s', mec='black')
ax.set_title("Gasoline Historical Price in WA", fontweight='bold', fontsize=20)
ax.set_ylabel("$/Gallon", fontweight='bold')

ax = fig.add_subplot(1,3,2) 
gasprices_df.query("state == 'WA' ").groupby('transc_date')['price'].mean().resample("W").mean().interpolate().plot(ax=ax, marker='s', mec='black')
ax.set_title("Gasoline Weekly Historical Price in WA", fontweight='bold', fontsize=20)
ax.set_ylabel("$/Gallon", fontweight='bold')

ax = fig.add_subplot(1,3,3) 
gasprices_df.query("state == 'WA' ").groupby('transc_date')['price'].mean().resample("M").mean().interpolate().plot(ax=ax, marker='s', mec='black')
ax.set_title("Gasoline Monthly Historical Price in WA", fontweight='bold', fontsize=20)
ax.set_ylabel("$/Gallon", fontweight='bold')

plt.tight_layout()

stl: 'STL' = STL(gasprices_df.query("state == 'WA' ").groupby('transc_date')['price'].mean(), period=365).fit() 
fig = plt.figure(figsize=(20,8)) 
ax = fig.add_subplot(2,2,1)
stl.observed.plot(ax=ax) 
ax.set_title("WA Observed Gasoline Prices", fontweight='bold', fontsize=20) 

ax = fig.add_subplot(2,2,2)
stl.trend.plot(ax=ax) 
ax.set_title("WA Trend Gasoline Prices", fontweight='bold', fontsize=20) 

ax = fig.add_subplot(2,2,3)
stl.seasonal.plot(ax=ax) 
ax.set_title("WA Seasonal Gasoline Prices", fontweight='bold', fontsize=20) 

ax = fig.add_subplot(2,2,4)
stl.resid.plot(ax=ax) 
ax.set_title("WA Residual Gasoline Prices", fontweight='bold', fontsize=20) 

plt.tight_layout()

daily_price: 'Series' = wa_prices_df.groupby('transc_date')['price'].mean()
daily_price.index = pd.to_datetime(daily_price.index)

# the decaying acf functions shows that there is a partial autocorrelation
fig = plt.figure(figsize=(12,4)) 
ax = fig.add_subplot(1,2,1)
acf = plot_acf(daily_price, ax=ax)

ax = fig.add_subplot(1,2,2)
pacf = plot_pacf(daily_price, ax=ax) # AR(1)

adf_one: 'adfuller' = adfuller(daily_price) 
pd.DataFrame( {'ADF':adf_one[0], 
               'P-Value': adf_one[1]}, 
             index=['ADF Test']).transpose()

adf_two: 'adfuler' = adfuller(np.diff(daily_price, n=1))

pd.DataFrame( {'ADF':adf_two[0], 
               'P-Value': adf_two[1]}, 
             index=['ADF Test']).transpose()

logging.basicConfig(level=logging.INFO)
log: 'LOG' = logging.getLogger(__name__) 
log.setLevel(logging.DEBUG) 

mem_steps: str = f"{path}/pickles/compute_job.pkl"
train: 'Series' = daily_price.sort_index().iloc[:-14]
test: 'Series' = daily_price.sort_index().iloc[-14:]

d: int = 1 
D: int = 0 
s: int = 365 
orders: List[tuple] = list(product(range(3),range(3),range(3),range(3)))
results: defaultdict = defaultdict(list)


for p,q,P,Q in orders: 
    
    if os.path.exists(mem_steps):
        current_job: 'DataFrame' = pd.read_pickle(mem_steps) 
        if (True in current_job.apply(lambda row: True if row['p'] == p and row['q'] == q  and row['P'] == P  and row['Q'] == Q else False, axis=1).tolist()):
            continue 
        
    model: SARIMAX = ARIMA(endog=train, 
                            order=(p,d,q),
                            seasonal_order=(P,D,Q,s), 
                            # enforce_stationarity=False
                            ).fit( cov_type='none', low_memory=True) 
                            #method='cg') # SARIMAX

    results['p'].append(p)
    results['q'].append(q)
    results['P'].append(P)
    results['Q'].append(Q)
    results['AIC'].append(model.aic)
    log.debug("[ + ] (%s,%s,%s,%s) AIC:%s" % (p,q,P,Q,model.aic ) )
    
    if os.path.exists(mem_steps):
        temp: List[Dict] = pd.DataFrame(results).to_dict(orient='records') 
        temp.extend(pd.read_pickle(mem_steps).to_dict(orient='records'))
        pd.DataFrame(temp).drop_duplicates(['p', 'q', 'P', 'Q', 'AIC'] ).to_pickle(mem_steps)

    else: 
        current_job = pd.DataFrame(results) 
        current_job.to_pickle(mem_steps)

DEBUG:__main__:[ + ] (2,1,2,0) AIC:-205.79998795407818
DEBUG:__main__:[ + ] (2,1,2,1) AIC:-203.7999960153758
DEBUG:__main__:[ + ] (2,1,2,2) AIC:-201.799999977549
DEBUG:__main__:[ + ] (2,2,0,0) AIC:-218.48257369948615
DEBUG:__main__:[ + ] (2,2,0,1) AIC:-216.48256084068456
DEBUG:__main__:[ + ] (2,2,0,2) AIC:-214.48259766285858
DEBUG:__main__:[ + ] (2,2,1,0) AIC:-216.4826017333369
DEBUG:__main__:[ + ] (2,2,1,1) AIC:-214.48260172434667
DEBUG:__main__:[ + ] (2,2,1,2) AIC:-212.48246039543835
DEBUG:__main__:[ + ] (2,2,2,0) AIC:-214.48256125980177
DEBUG:__main__:[ + ] (2,2,2,1) AIC:-212.48259922583176
DEBUG:__main__:[ + ] (2,2,2,2) AIC:-210.48257705355968

pd.read_pickle(mem_steps).sort_values(by='AIC').head(25)

p,d,q = 2,1,2
P,D,Q,s = 0,0,0,365

model: ARIMA = ARIMA(endog=train.sort_index(),
                     order=(p,d,q),
                     seasonal_order=(P,D,Q,s)).fit( cov_type='none', low_memory=True)

model.summary()

model_one = model.plot_diagnostics(figsize=(25,10)) #

acorr_ljungbox(model.resid)

fig = plt.figure()
ax = fig.add_subplot() 
acorr_ljungbox(model.resid, lags=20)['lb_pvalue'].rename({'lb_pvalue': 'p_value'}).plot(ax=ax, marker='s', mec='black') 
ax.set_ylabel("P-Values", fontweight='bold')
ax.set_xlabel("Lags", fontweight='bold') 

ax.set_title("Ljungbox Test ARIMA Model", fontweight='bold', fontsize=20)

plt.tight_layout()

model_one_forecast: 'DataFrame'  = pd.DataFrame( model.forecast(steps=5) )
model_one_forecast.index = pd.date_range(start='2023-08-03', end=pd.Series(pd.to_datetime("2023-08-02") +  pd.offsets.BDay(3)).astype(str).tolist()[0] , freq='D') 
pd.concat([ model_one_forecast, test.head(5) ], axis=1)

WINDOW: int = 1 
last_known_price: List = [] 
model_one: List = [] 

for index in range(len(train), daily_price.shape[0], WINDOW):
    model: ARIMA = ARIMA(endog=train.sort_index(),
                         order=(p,d,q),
                         seasonal_order=(P,D,Q,s)).fit( cov_type='none', low_memory=True)     
    
    last_known_price.append(daily_price.iloc[index-WINDOW: index].tolist()[0] )
    model_one.append(model.get_prediction(0, index + WINDOW  - 1).predicted_mean.iloc[-WINDOW:].tolist()[0]) # gasoline_price in WA @day 
    
gasprice_wa: 'DataFrame' = pd.DataFrame({'actual_price': test.tolist(),
                                          'bench_mark': last_known_price, 
                                          'model_one': model_one 
                                         })

gasprice_wa.index = test.index 
gasprice_wa

fig = plt.figure(figsize=(12,4)) 
ax = fig.add_subplot() 
gasprice_wa.plot(ax=ax, marker='s', mec='black') 
ax.set_title("Gasoline Prices in Washington State", fontweight='bold', fontsize=20)
ax.set_ylabel("$/Gallon", fontweight='bold')
ax.set_xlabel("Transcation Date", fontweight='bold')
ax.legend(bbox_to_anchor=(1,1), title="Model Comparison", title_fontsize='medium', shadow=True)

plt.tight_layout()

y_true: 'Series' = test 
y_pred: 'Series' = model_one 
y_pred_one: 'Series' = last_known_price

pd.DataFrame({'model_one': metrics.mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred), 
              'bench_mark':  metrics.mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred_one)}, index=['MAPE']).transpose()

pd.DataFrame({'model_one': metrics.mean_absolute_error(y_true=y_true, y_pred=y_pred), 
              'bench_mark':  metrics.mean_absolute_error(y_true=y_true, y_pred=y_pred_one)}, index=['MAE']).transpose()

	index	gas_station	country	city	address	zip_code	state	regular_gas	midgrade_gas	premium_gas	...	latitude	longitude	timestamp	rvp	season	price	review	review_date	sentiment_score	transc_date
0	0	1st Place Food Mart	US	Tillmans Corner	7000 Three Notch-Kroner Rd	36619	AL	1	1	1	...	30.589132	-88.205224	2023-05-21T12:10:31.001Z	no	Spring	2.82	love the cheaper gas.	2022-05-02T18:39:36.45Z	1.00000	2023-05-21
1	1	Zippy Mart	US	Tillmans Corner	5936 Three Notch Rd	36619	AL	1	1	1	...	30.589263	-88.179333	2023-05-22T00:19:42.007Z	no	Spring	2.82	GOOD SERVICE	2017-01-23T09:40:24.88Z	0.57020	2023-05-22
2	2	AAFES	US	Fort McClellan	1167 Fremont Rd	36205	AL	1	1	1	...	33.731789	-85.791281	2023-05-21T19:29:16.272Z	no	Spring	2.82	Military and family members only. They earned ...	2019-06-27T20:13:50.033Z	-0.19800	2023-05-21
3	3	Sam's Club	US	Auburn	2335 Bent Creek Rd	36830	AL	1	0	1	...	32.608063	-85.428061	2023-05-22T02:21:02.515Z	no	Spring	2.84	Generally best price in town.	2022-11-02T19:20:19.87Z	0.81845	2023-05-22
4	4	Sunoco	US	Leeds	7501 Parkway Dr	35094	AL	1	0	1	...	33.544691	-86.554397	2023-05-21T13:38:20.812Z	no	Spring	2.82	great service and amazing Race Fuels	2020-06-18T14:01:04.797Z	0.76800	2023-05-21

	gas_station_name	pay_status	posted_time	address	zip_code	payment_type	price
0	Swinomish Markets at the Casio	1	2023-08-04 18:23:19.927	12939 Casino Dr	98221-8351	cash	4.15
1	Swinomish Markets at the Casio	1	2023-08-04 18:23:19.927	12939 Casino Dr	98221-8351	cash	4.15
2	Hwy 97 Truck Plaza	1	2023-08-05 01:41:02.793	61313 Hwy 97	98948	credit_card	4.15
3	Hwy 97 Truck Plaza	1	2023-08-05 01:41:02.793	61313 Hwy 97	98948	credit_card	4.15
4	Meridian Liquor Store	1	2023-08-05 12:14:32.347	4209 Meridian St	98226-5512	cash	4.39

payment_type	cash	credit_card
gas_station_name
76	4.3	N/A
ARCO	4.33	N/A
Angel of the Winds Fuel	N/A	4.45
Costco	N/A	4.3
Harolds Market	N/A	4.1
Hwy 97 Truck Plaza	N/A	4.3
JK Gas & Groceries	4.39	N/A
Little Mountain Grocery	4.4	N/A
Maverik	N/A	4.37
Meridian Liquor Store	4.44	N/A
Midway Gas & Food	4.42	N/A
Safeway	4.35	4.36
Samcor Fuel	N/A	4.39
Sinclair	N/A	4.3
Super Duper	4.45	N/A
Super Gas	N/A	4.47
Sure Save Grocery	N/A	4.49
Swinomish Markets at the Casio	4.2	N/A
Swinomish Markets at the Villa	4.25	N/A
The Tote	N/A	4.4
Topp Mart	N/A	4.29
Walmart	N/A	4.33
Wheelers Kountry Korner	4.14	N/A
Yakamart	N/A	4.2
Yakima Ave. Quick Stop	4.39	N/A

	p	q	P	Q	AIC
3	2	2	0	0	-218.482574
6	2	2	1	0	-216.482602
4	2	2	0	1	-216.482561
7	2	2	1	1	-214.482602
5	2	2	0	2	-214.482598
9	2	2	2	0	-214.482561
10	2	2	2	1	-212.482599
8	2	2	1	2	-212.482460
11	2	2	2	2	-210.482577
30	0	1	0	0	-209.339313
75	2	0	0	0	-209.040195
66	1	2	0	0	-208.699817
48	1	0	0	0	-208.633456
39	0	2	0	0	-208.593247
57	1	1	0	0	-207.656525
84	2	1	0	0	-207.548823
31	0	1	0	1	-207.339313
33	0	1	1	0	-207.339313
78	2	0	1	0	-207.040195
76	2	0	0	1	-207.040184
69	1	2	1	0	-206.699919
67	1	2	0	1	-206.699856
51	1	0	1	0	-206.633456
49	1	0	0	1	-206.633456
42	0	2	1	0	-206.593247

Dep. Variable:	price	No. Observations:	71
Model:	ARIMA(2, 1, 2)	Log Likelihood	114.241
Date:	Wed, 16 Aug 2023	AIC	-218.483
Time:	18:32:57	BIC	-207.240
Sample:	0	HQIC	-214.017
	- 71
Covariance Type:	Not computed

ODIN Project¶

Project Resources¶

DB Auth¶

Fuel Taxes¶

Gasoline Mixtures¶

Remove Duplicates Entries from DB1¶

Explore Historical Gasoline Prices by States¶

Boxplot Gasoline Prices by States¶

Histogram Gasoline Prices by States¶

Average Gasoline Prices and Number of Visitors¶

Average Gasoline Prices by States and Gas Stations¶

Trends: Total Average Gasoline Price in the U.S¶

Modeling Gasoline Prices in Washington State using SARIMAX¶

Autocorrelation and Partial Autocorrelation Plot¶

Adfuller Test¶

ARIMA Model: Gasoline Prices in WA¶

ARIMA Diagnostic Plots¶

Ljungbox Test for ARIMA Model¶

Forecast V.S Actual¶

Rolling Forecast¶

Model Evaluation¶

	coef	std err	z	P>\|z\|	[0.025	0.975]
ar.L1	0.0219	nan	nan	nan	nan	nan
ar.L2	-0.6730	nan	nan	nan	nan	nan
ma.L1	-0.4471	nan	nan	nan	nan	nan
ma.L2	0.8773	nan	nan	nan	nan	nan
sigma2	0.0022	nan	nan	nan	nan	nan

Ljung-Box (L1) (Q):	0.01	Jarque-Bera (JB):	2.15
Prob(Q):	0.91	Prob(JB):	0.34
Heteroskedasticity (H):	0.73	Skew:	-0.35
Prob(H) (two-sided):	0.46	Kurtosis:	3.49

	lb_stat	lb_pvalue
1	0.043151	0.835441
2	0.077116	0.962176
3	0.090288	0.992977
4	0.110965	0.998517
5	0.127165	0.999707
6	0.128713	0.999958
7	0.133871	0.999994
8	0.135984	0.999999
9	0.190538	1.000000
10	0.208261	1.000000

	predicted_mean	price
2023-08-03	4.280549	4.289740
2023-08-04	4.301888	4.327059
2023-08-05	4.322686	4.348551
2023-08-06	4.308781	4.293377
2023-08-07	4.294479	4.302000

	ADF Test
ADF	-1.482823
P-Value	0.541991

	ADF Test
ADF	-3.756059
P-Value	0.003390

	actual_price	bench_mark	model_one
transc_date
2023-08-03	4.289740	4.310758	4.280549
2023-08-04	4.327059	4.289740	4.301888
2023-08-05	4.348551	4.327059	4.322686
2023-08-06	4.293377	4.348551	4.308781
2023-08-07	4.302000	4.293377	4.294479
2023-08-08	4.319149	4.302000	4.303523
2023-08-09	4.337143	4.319149	4.313346
2023-08-10	4.351569	4.337143	4.307475
2023-08-11	4.357353	4.351569	4.300736
2023-08-12	4.367761	4.357353	4.304539
2023-08-13	4.359437	4.367761	4.309158
2023-08-14	4.311438	4.359437	4.306700
2023-08-15	4.334505	4.311438	4.303537
2023-08-16	4.245902	4.334505	4.305122

	MAPE
model_one	0.007120
bench_mark	0.006264