PADD 1 District¶
from odin_eia_db.engine.odin_spark import ODINSpark
from eia.utils.aws_athena import AWSAthena
from pyspark.sql import functions as f
import pandas as pd
import numpy as np
import warnings
from odin_eia_db.utils.constants import INVERSE_US_REGIONS, US_REGIONS, PADD_AREAS
from eia.utils.constants import INVERSE_STATES
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict
plt.style.use('ggplot')
warnings.filterwarnings('ignore')
pd.options.display.max_rows = 200
Constant Vars¶
MONTH_LOOKUP: Dict = { x:pd.to_datetime(f"1900-{x}-01").month_name() for x in range(1,13) }
SEASON_LOOKUP: Dict = {'Spring': range(3,6),
'Summer': range(6,10),
'Fall': range(9,12),
'Winter': range(1,3)
}
Helper Funcitons¶
@f.udf
def get_transc_date(transc_date) -> str:
try:
return pd.to_datetime(transc_date, format="ISO8601").strftime("%Y-%m-%d")
except:
return "N/A"
@f.udf
def get_padd_district(state_name: str):
# Get PADD District loc based on state_name
return PADD_AREAS.get( INVERSE_US_REGIONS.get( INVERSE_STATES.get(state_name) ) , 'N/A' )
@f.udf
def get_month_name(month: int):
return MONTH_LOOKUP.get(month, np.nan)
@f.udf
def get_state_name(state: str):
return INVERSE_STATES.get(state, 'N/A')
ODIN Project Helper¶
odin_athena: AWSAthena = AWSAthena()
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
ODIN Athena¶
- Retrieve gasoline legacy pricing
- Retrieve current gasoline pricing
query: str = """
WITH legacy_pricing AS
( SELECT
ROW_NUMBER() OVER() AS "id",
col1 AS "gas_station",
col2 AS "country",
col3 AS "city",
col4 AS "address",
col5 AS "zip_code",
col6 AS "state",
col7 AS "regular_gas",
col8 AS "midgrade_gas",
col9 AS "premium_gas",
col10 AS "price_unit",
col11 AS "star_rating",
col12 AS "latitude",
col13 AS "longitude",
col14 AS "timestamp",
col15 AS "price",
col16 AS "review",
col17 AS "review_date",
col18 AS "sentiment_score"
FROM live_gasoline_prices l )
SELECT
l.*
FROM legacy_pricing l
WHERE l.id > 1
"""
query_execution_id: str = odin_athena.execute_query(db='odin_db', query=query)
s3_path: str = query_execution_id.get('s3_path')
# consume using ODIN-Spark
TaskForce ODIN¶
- Current Gasoline Pricing
query: str = """
SELECT
l.gas_station_name AS `gas_station`,
l.postedtime AS `timestamp`,
DATE_FORMAT(l.postedtime , '%Y-%m-%d' ) AS `transc_date`,
CONCAT(l.address,', ', l.city, ', ', l.state, ', ', l.zip_code) AS `store_loc`,
IFNULL( l.price, 0.0) AS `price`,
IFNULL( l.longitude, 0.0) AS `longitude`,
IFNULL( l.latitude, 0.0) AS `latitude`,
IFNULL( l.star_rating, 0.0) AS `star_rating`,
l.state,
CASE
WHEN DATE_FORMAT( CONCAT('2024', '-', MONTH(l.postedtime), '-', DAY(l.postedtime)), '%Y-%m-%d' ) BETWEEN DATE_FORMAT("2024-06-20", '%Y-%m-%d' ) AND DATE_FORMAT("2024-09-22", '%Y-%m-%d' ) THEN 'Summer'
WHEN DATE_FORMAT( CONCAT('2024', '-', MONTH(l.postedtime), '-', DAY(l.postedtime)), '%Y-%m-%d' ) BETWEEN DATE_FORMAT("2024-09-22", '%Y-%m-%d' ) AND DATE_FORMAT("2024-12-20", '%Y-%m-%d' ) THEN 'Fall'
WHEN DATE_FORMAT( CONCAT('2024', '-', MONTH(l.postedtime), '-', DAY(l.postedtime)), '%Y-%m-%d' ) BETWEEN DATE_FORMAT("2024-12-21", '%Y-%m-%d' ) AND DATE_FORMAT("2024-03-20", '%Y-%m-%d' ) THEN 'Winter'
WHEN DATE_FORMAT( CONCAT('2024', '-', MONTH(l.postedtime), '-', DAY(l.postedtime)), '%Y-%m-%d' ) BETWEEN DATE_FORMAT("2024-03-21", '%Y-%m-%d' ) AND DATE_FORMAT("2024-06-19", '%Y-%m-%d' ) THEN 'Spring'
END AS `season`
FROM gasoline_pricing l
WHERE IFNULL(l.postedtime, 'N/A') != 'N/A'
"""
gasoline_pricing: 'ODINSpark' = ODINSpark(app_name='gasoline_pricing').exec_odin_mysql_query(query=query)
gasoline_pricing = gasoline_pricing.drop_duplicates(subset=['timestamp', 'latitude', 'longitude' ] )
gasoline_pricing = gasoline_pricing.withColumn('price', f.col('price').cast('double').alias('price'))
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials DEBUG:odin_eia_db.engine.odin_spark: ..` ./sdmds:` +dmmds+-` -+ymmmmmmmmdo:` ./osyyy/` `ommmmmmmmdhso:` ./shmmmmmmmmmmmmmmds/. ./ohmmmdmdmm/ +mmmmmmmmmmmmmmmhy+//:://///++shdmmmmmmmmmmhsdmmmmmmmmmddyso+/::--.-://+syhdmmmmmmmmmmmmh` :dmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm+ .hdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmsooodmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd. ommmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmh+osoymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm/ :dmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+oohmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmms `hdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdsoo+ososmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd. /dmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmyoo++ooooymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmo `ydmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmhoooooooooodmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd` :mmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+o+ooooosmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm/ ommmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmy+o+o+o+o+oooymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmy ydmmmdddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmhooo+ooo+oo+ooodmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd. .dmmmddddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+oo+o+ooo+oosmmmmmmmmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmd: -dmmmmddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmsoo+oooo+oo+oooo+hmmmmmmmmmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmdo +dmmmmdmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmh+oo+oooooooooooooodmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdy sdmmmmddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+ooooo+o+oooooo+smmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdh ymmmmddsoosssssyyyyyyyhhhhhhhhhhhhhhhhhs+o+o++o+o++oooooooo+hhhddddddddddddddmdmmmddmdmdmdmdmmmmmmmd hmmmmdddyo+++++++++ooo+++o+oooooooooooooo+oosyyyhhhddhdddddyssssoooooooooooooooooooosoooooooymmmmmmd hmmmmddmmdhso++++++++o++o+++ooo++++o+++ohhdddmddddddddddddddddddhy+++o++oo+oooooooooo+++++shdmmmmmmh ymmmmdddmmmmdyso++++o+o+ooooooooooo+oshdmddddddddmddddddddddddhyso+oooooo++o++o+o++o++oshdmmmmmmmmdh ydmmmddmmmmmdmmdyso+o++oooooooo+++o+ddddddmddddmddddddddddddddddhyo+oo+o+++o+++++++oshddmmmmmmmmmmdy odmmmdddmdmddmmmmmhyoo++o+o+ooo+o+osddmmdddddddddddddddddddddddhyso++o++++++++++ooydmmmmmdmmmmmmmmdo /dmmmdddmddmdmmmmmmmdysoo++++oo+o+oddddmdddddddddddddddddddddddddyo++++o++++++oyddmmmmmmmmmmmmmmmmd/ .dmmmmdddddmddmmmmmmmmmdyso+++o+ooyddmmdmddddddddmddddddddddddddysoooo++++ooyhdmmdmmmmmmmmmmmmmmmdd` `ydmmmdddddmmddmmmmmmmmmmdhso++++sdddmmmmdddddddmmddddddddddddmddyo+++++oyhdmmmmmmmmmmmmmmmmmmmmmds +dmmmdddddddddmmmmmmmmmmmmmdysoohdmdmmmmmdddddddmmdddddddddddddhyo++oyhdmmmmmmmmmmmmmmmmmmmmmmmmd: -dmmmddddddddmmmmmmmmmmmmmmmmdhooyddmdmmmddddmdddddddddddddddddy+oyhdmmmmmmmmmmmmmmmmmmmmmdmmmmdy` `ydmmmdddddddddddmmmmmmmmmmmmmyoohddddmmmdddmmmddmddmdddddddddds+sdmmmmmmmmmmmmmmmmmmmmmmmmmmmmd/ :dmmmddddmddddddmmmmmmmmmmmddoo+yddddddmdmmmmmdddddddddddddddddy+ymmmmmmdmmmmmdmmmmmmmmmmmmmmdh` sdmmmddddddmddmmmmmmmmmmmmdso+oosyyyydddmmddddddddddddddddddyos+odmmmmmddmmmmmmmmdmmmmmmmmmmdo -dmmmdddddddmdmmmmmmmmdmmmyoo++o+o++oyddmdddddddddddddddddddh+o++sdmmmmdmmmmmmmmmmmmmmmmmmmmy` odmmdddddddmmmmmmmmmmddmhoooooo+oo++ydddddhhyosssdddddddddso+++o+hmdmmmmmmmmmmmmdmmdmmmmmmd- `hdmmmdddddmdmmmdmmmmdddoo++oo+++o++hddhso+++++++oydddddddyo+++o+oddmmmmmmdmmmmmmmmmmmmmmdo -hdmmdddddmdddddmddmmdy++++oo++++o+hys++ooshyso+++yddddyso++++o++ydmmmmmmmmmdmmmmddmmmmdy` /ddmdddddddmdddmmmmmh+o+++oo+++++++++oshdddddhyoooyhddy++++++++++ddmdmmmdmdmmmmmddmmmdh- `sdmmmdddddddmdmmmmdo++++oo++++++++oydddmdddddddhyoosyo+++++o++o+ydddddddmdmmmmmmmmmmd: `yddmmdddddmmmmmmms++++o++++++ooyddddmmdmdmdddmdddys+++++++o+++oodddmddddddmmddmmmdd+ -ydmmmddddddddmmh++++++o++oshddddddmmmdddddddddddddhyoo+++o+++o+sdddmddmmmddddmmmd+ .hdmmmdmddddddds++++++osydmmdddddmmddddddddddddddddddhso++++++oohdmmmddddddmmmddo` -ydmmmmdddddds++++ooyhdmmdmmmddddmdddmdmddddddmddddddddyso+++++odddmddddddmmddo` .ydmmmddddddoo+oyhdmmmmmdmmmddddddddmdddmddddddddddddddddyo++++sddmdddddmmdd+` `sdmmmmmdds+shddmdmmmmmmddmdddddddddddddddddddddddddddddddhs+++hdddddmmmdd+` `+ddmmmddyhddmmmmmmmmmdddmdddddddddddddddddddddddddddddddmdhsosddddmmmdh/ :hdmmmddddddmmmmdmmmddddddddddddddddddddddddddddddddddddddddddddmmmdh- .sdmmmdddmmmmmmmdmdmmddddddddddddddddddddddddddddddddddddddddmmmddy. /hdmmmmmmmmmddddddmddddddddddddddddddddddddddddddddddddddmmmmdh/` .sddmmmmddddddddddddddddddddddddddddddddddddddddddddddddddddo. /hdmdmmdmddddddmdmddddddddddddddddddddddddddddddddddddddo- .odddmdmddddddmdmddddddddddddddddmdddddddddddddmdddddo- .+hddmdmddddmdddddddddddddddddddddddddddddddddddh+. `/yddddmddddmddddddddddddddddddmddddddddddddy/` `:sddddddddddddddddddddddddmddddddddmddds: ./ydddddddddddddddddddddddddddddddy/` -+hdddddddddddddddddddddddddy+. `-ohdddddddddddddddddddy/. -+yddddddddddddyo-` .:oyddddhs/. ▄▄▄▄ ▄▄▄▄▄ ▄▄▄▄▄▄ ▄▄▄ ▄▄ ▄▄▄▄ ▄▄▄▄▄▄ ▄▄ ▄▄▄▄▄▄ ▄▄ ▄▄▄ ██▀▀██ ██▀▀▀██ ▀▀██▀▀ ███ ██ ▄█▀▀▀▀█ ██▀▀▀▀█▄ ████ ██▀▀▀▀██ ██ ██▀ ██ ██ ██ ██ ██ ██▀█ ██ ██▄ ██ ██ ████ ██ ██ ██▄██ ██ ██ ██ ██ ██ ██ ██ ██ ▀████▄ ██████▀ ██ ██ ███████ █████ ██ ██ ██ ██ ██ ██ █▄██ █████ ▀██ ██ ██████ ██ ▀██▄ ██ ██▄ ██▄▄██ ██▄▄▄██ ▄▄██▄▄ ██ ███ █▄▄▄▄▄█▀ ██ ▄██ ██▄ ██ ██ ██ ██▄ ▀▀▀▀ ▀▀▀▀▀ ▀▀▀▀▀▀ ▀▀ ▀▀▀ ▀▀▀▀▀ ▀▀ ▀▀ ▀▀ ▀▀ ▀▀▀ ▀▀ ▀▀ [ OK ] Launching ODIN @2024-08-14T19:35:38.302220 DEBUG:odin_eia_db.engine.odin_spark:[ OK ] Loading Spark Configuration For ODIN-Project...
:: loading settings :: url = jar:file:/opt/spark-3.5.1-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ab2ec321-17ef-4028-8654-4625952ed070;1.0
confs: [default]
found org.apache.hadoop#hadoop-aws;3.3.1 in central
found com.amazonaws#aws-java-sdk-bundle;1.11.901 in central
found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 400ms :: artifacts dl 11ms
:: modules in use:
com.amazonaws#aws-java-sdk-bundle;1.11.901 from central in [default]
org.apache.hadoop#hadoop-aws;3.3.1 from central in [default]
org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
---------------------------------------------------------------------
| | modules || artifacts |
| conf | number| search|dwnlded|evicted|| number|dwnlded|
---------------------------------------------------------------------
| default | 3 | 0 | 0 | 0 || 3 | 0 |
---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-ab2ec321-17ef-4028-8654-4625952ed070
confs: [default]
0 artifacts copied, 3 already retrieved (0kB/5ms)
24/08/14 19:35:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
DEBUG:odin_eia_db.engine.odin_spark:[ OK ] Loading Hadoop Configuration For ODIN-Project...
gasoline_pricing.printSchema()
root |-- gas_station: string (nullable = true) |-- timestamp: timestamp (nullable = true) |-- transc_date: string (nullable = true) |-- store_loc: string (nullable = true) |-- price: double (nullable = true) |-- longitude: double (nullable = true) |-- latitude: double (nullable = true) |-- star_rating: double (nullable = true) |-- state: string (nullable = true) |-- season: string (nullable = true)
gasoline_pricing.show(5)
[Stage 19:> (0 + 1) / 1]
+--------------+-------------------+-----------+--------------------+-----+-----------+---------+-----------+-----+------+ | gas_station| timestamp|transc_date| store_loc|price| longitude| latitude|star_rating|state|season| +--------------+-------------------+-----------+--------------------+-----+-----------+---------+-----------+-----+------+ |Murphy Express|2024-08-02 00:00:14| 2024-08-02|221 S White Sands...| 2.86|-105.959781|32.888533| 0.0| NM|Summer| |Murphy Express|2024-08-02 00:00:16| 2024-08-02|199 Maryport Dr, ...| 2.85| -78.956583|33.635091| 0.0| SC|Summer| |Murphy Express|2024-08-02 00:00:16| 2024-08-02|13443 South Hamil...| 3.15|-111.976158|40.506858| 0.0| UT|Summer| | Murphy USA|2024-08-02 00:00:21| 2024-08-02|210 Haynes Street...| 2.82| -86.081775|33.439887| 0.0| AL|Summer| | Murphy USA|2024-08-02 00:00:21| 2024-08-02|1807 E Highland D...| 2.81| -90.685234|35.821236| 0.0| AR|Summer| +--------------+-------------------+-----------+--------------------+-----+-----------+---------+-----------+-----+------+ only showing top 5 rows
- ODIN Legacy Gasoline Pricing
query: str = """
SELECT
l.gas_station,
l.timestamp,
DATE_FORMAT(l.timestamp , '%Y-%m-%d' ) AS `transc_date`,
CONCAT(l.address,', ', l.city, ', ', l.state, ', ', l.zip_code) AS `store_loc`,
IFNULL( l.price, 0.0) AS `price`,
IFNULL( l.longitude, 0.0) AS `longitude`,
IFNULL( l.latitude, 0.0) AS `latitude`,
l.star_rating,
l.state,
/* naive season labeling */
CASE
WHEN DATE_FORMAT( CONCAT('2024', '-', MONTH(l.timestamp), '-', DAY(l.timestamp)), '%Y-%m-%d' ) BETWEEN DATE_FORMAT("2024-06-20", '%Y-%m-%d' ) AND DATE_FORMAT("2024-09-22", '%Y-%m-%d' ) THEN 'Summer'
WHEN DATE_FORMAT( CONCAT('2024', '-', MONTH(l.timestamp), '-', DAY(l.timestamp)), '%Y-%m-%d' ) BETWEEN DATE_FORMAT("2024-09-22", '%Y-%m-%d' ) AND DATE_FORMAT("2024-12-20", '%Y-%m-%d' ) THEN 'Fall'
WHEN DATE_FORMAT( CONCAT('2024', '-', MONTH(l.timestamp), '-', DAY(l.timestamp)), '%Y-%m-%d' ) BETWEEN DATE_FORMAT("2024-12-21", '%Y-%m-%d' ) AND DATE_FORMAT("2025-03-20", '%Y-%m-%d' ) THEN 'Winter'
WHEN DATE_FORMAT( CONCAT('2024', '-', MONTH(l.timestamp), '-', DAY(l.timestamp)), '%Y-%m-%d' ) BETWEEN DATE_FORMAT("2024-03-21", '%Y-%m-%d' ) AND DATE_FORMAT("2024-06-19", '%Y-%m-%d' ) THEN 'Spring'
ELSE 'Winter'
END AS `season`
FROM legacy_gasoline_pricing l
WHERE IFNULL(l.timestamp, 'N/A') != 'N/A'
"""
legacy_pricing: 'ODINSpark' = ODINSpark(app_name='gasoline_legacy_pricing').exec_odin_mysql_query(query=query)
legacy_pricing = legacy_pricing.drop_duplicates(subset=['timestamp', 'latitude', 'longitude' ] )
legacy_pricing = legacy_pricing.withColumn('price', f.col('price').cast('double').alias('price'))
DEBUG:odin_eia_db.engine.odin_spark: ..` ./sdmds:` +dmmds+-` -+ymmmmmmmmdo:` ./osyyy/` `ommmmmmmmdhso:` ./shmmmmmmmmmmmmmmds/. ./ohmmmdmdmm/ +mmmmmmmmmmmmmmmhy+//:://///++shdmmmmmmmmmmhsdmmmmmmmmmddyso+/::--.-://+syhdmmmmmmmmmmmmh` :dmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm+ .hdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmsooodmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd. ommmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmh+osoymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm/ :dmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+oohmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmms `hdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdsoo+ososmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd. /dmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmyoo++ooooymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmo `ydmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmhoooooooooodmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd` :mmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+o+ooooosmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm/ ommmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmy+o+o+o+o+oooymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmy ydmmmdddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmhooo+ooo+oo+ooodmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd. .dmmmddddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+oo+o+ooo+oosmmmmmmmmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmd: -dmmmmddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmsoo+oooo+oo+oooo+hmmmmmmmmmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmdo +dmmmmdmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmh+oo+oooooooooooooodmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdy sdmmmmddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+ooooo+o+oooooo+smmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdh ymmmmddsoosssssyyyyyyyhhhhhhhhhhhhhhhhhs+o+o++o+o++oooooooo+hhhddddddddddddddmdmmmddmdmdmdmdmmmmmmmd hmmmmdddyo+++++++++ooo+++o+oooooooooooooo+oosyyyhhhddhdddddyssssoooooooooooooooooooosoooooooymmmmmmd hmmmmddmmdhso++++++++o++o+++ooo++++o+++ohhdddmddddddddddddddddddhy+++o++oo+oooooooooo+++++shdmmmmmmh ymmmmdddmmmmdyso++++o+o+ooooooooooo+oshdmddddddddmddddddddddddhyso+oooooo++o++o+o++o++oshdmmmmmmmmdh ydmmmddmmmmmdmmdyso+o++oooooooo+++o+ddddddmddddmddddddddddddddddhyo+oo+o+++o+++++++oshddmmmmmmmmmmdy odmmmdddmdmddmmmmmhyoo++o+o+ooo+o+osddmmdddddddddddddddddddddddhyso++o++++++++++ooydmmmmmdmmmmmmmmdo /dmmmdddmddmdmmmmmmmdysoo++++oo+o+oddddmdddddddddddddddddddddddddyo++++o++++++oyddmmmmmmmmmmmmmmmmd/ .dmmmmdddddmddmmmmmmmmmdyso+++o+ooyddmmdmddddddddmddddddddddddddysoooo++++ooyhdmmdmmmmmmmmmmmmmmmdd` `ydmmmdddddmmddmmmmmmmmmmdhso++++sdddmmmmdddddddmmddddddddddddmddyo+++++oyhdmmmmmmmmmmmmmmmmmmmmmds +dmmmdddddddddmmmmmmmmmmmmmdysoohdmdmmmmmdddddddmmdddddddddddddhyo++oyhdmmmmmmmmmmmmmmmmmmmmmmmmd: -dmmmddddddddmmmmmmmmmmmmmmmmdhooyddmdmmmddddmdddddddddddddddddy+oyhdmmmmmmmmmmmmmmmmmmmmmdmmmmdy` `ydmmmdddddddddddmmmmmmmmmmmmmyoohddddmmmdddmmmddmddmdddddddddds+sdmmmmmmmmmmmmmmmmmmmmmmmmmmmmd/ :dmmmddddmddddddmmmmmmmmmmmddoo+yddddddmdmmmmmdddddddddddddddddy+ymmmmmmdmmmmmdmmmmmmmmmmmmmmdh` sdmmmddddddmddmmmmmmmmmmmmdso+oosyyyydddmmddddddddddddddddddyos+odmmmmmddmmmmmmmmdmmmmmmmmmmdo -dmmmdddddddmdmmmmmmmmdmmmyoo++o+o++oyddmdddddddddddddddddddh+o++sdmmmmdmmmmmmmmmmmmmmmmmmmmy` odmmdddddddmmmmmmmmmmddmhoooooo+oo++ydddddhhyosssdddddddddso+++o+hmdmmmmmmmmmmmmdmmdmmmmmmd- `hdmmmdddddmdmmmdmmmmdddoo++oo+++o++hddhso+++++++oydddddddyo+++o+oddmmmmmmdmmmmmmmmmmmmmmdo -hdmmdddddmdddddmddmmdy++++oo++++o+hys++ooshyso+++yddddyso++++o++ydmmmmmmmmmdmmmmddmmmmdy` /ddmdddddddmdddmmmmmh+o+++oo+++++++++oshdddddhyoooyhddy++++++++++ddmdmmmdmdmmmmmddmmmdh- `sdmmmdddddddmdmmmmdo++++oo++++++++oydddmdddddddhyoosyo+++++o++o+ydddddddmdmmmmmmmmmmd: `yddmmdddddmmmmmmms++++o++++++ooyddddmmdmdmdddmdddys+++++++o+++oodddmddddddmmddmmmdd+ -ydmmmddddddddmmh++++++o++oshddddddmmmdddddddddddddhyoo+++o+++o+sdddmddmmmddddmmmd+ .hdmmmdmddddddds++++++osydmmdddddmmddddddddddddddddddhso++++++oohdmmmddddddmmmddo` -ydmmmmdddddds++++ooyhdmmdmmmddddmdddmdmddddddmddddddddyso+++++odddmddddddmmddo` .ydmmmddddddoo+oyhdmmmmmdmmmddddddddmdddmddddddddddddddddyo++++sddmdddddmmdd+` `sdmmmmmdds+shddmdmmmmmmddmdddddddddddddddddddddddddddddddhs+++hdddddmmmdd+` `+ddmmmddyhddmmmmmmmmmdddmdddddddddddddddddddddddddddddddmdhsosddddmmmdh/ :hdmmmddddddmmmmdmmmddddddddddddddddddddddddddddddddddddddddddddmmmdh- .sdmmmdddmmmmmmmdmdmmddddddddddddddddddddddddddddddddddddddddmmmddy. /hdmmmmmmmmmddddddmddddddddddddddddddddddddddddddddddddddmmmmdh/` .sddmmmmddddddddddddddddddddddddddddddddddddddddddddddddddddo. /hdmdmmdmddddddmdmddddddddddddddddddddddddddddddddddddddo- .odddmdmddddddmdmddddddddddddddddmdddddddddddddmdddddo- .+hddmdmddddmdddddddddddddddddddddddddddddddddddh+. `/yddddmddddmddddddddddddddddddmddddddddddddy/` `:sddddddddddddddddddddddddmddddddddmddds: ./ydddddddddddddddddddddddddddddddy/` -+hdddddddddddddddddddddddddy+. `-ohdddddddddddddddddddy/. -+yddddddddddddyo-` .:oyddddhs/. ▄▄▄▄ ▄▄▄▄▄ ▄▄▄▄▄▄ ▄▄▄ ▄▄ ▄▄▄▄ ▄▄▄▄▄▄ ▄▄ ▄▄▄▄▄▄ ▄▄ ▄▄▄ ██▀▀██ ██▀▀▀██ ▀▀██▀▀ ███ ██ ▄█▀▀▀▀█ ██▀▀▀▀█▄ ████ ██▀▀▀▀██ ██ ██▀ ██ ██ ██ ██ ██ ██▀█ ██ ██▄ ██ ██ ████ ██ ██ ██▄██ ██ ██ ██ ██ ██ ██ ██ ██ ▀████▄ ██████▀ ██ ██ ███████ █████ ██ ██ ██ ██ ██ ██ █▄██ █████ ▀██ ██ ██████ ██ ▀██▄ ██ ██▄ ██▄▄██ ██▄▄▄██ ▄▄██▄▄ ██ ███ █▄▄▄▄▄█▀ ██ ▄██ ██▄ ██ ██ ██ ██▄ ▀▀▀▀ ▀▀▀▀▀ ▀▀▀▀▀▀ ▀▀ ▀▀▀ ▀▀▀▀▀ ▀▀ ▀▀ ▀▀ ▀▀ ▀▀▀ ▀▀ ▀▀ [ OK ] Launching ODIN @2024-08-14T19:36:03.234524 DEBUG:odin_eia_db.engine.odin_spark:[ OK ] Loading Spark Configuration For ODIN-Project... DEBUG:odin_eia_db.engine.odin_spark:[ OK ] Loading Hadoop Configuration For ODIN-Project...
legacy_pricing.printSchema()
root |-- gas_station: string (nullable = true) |-- timestamp: string (nullable = true) |-- transc_date: string (nullable = true) |-- store_loc: string (nullable = true) |-- price: double (nullable = true) |-- longitude: string (nullable = true) |-- latitude: string (nullable = true) |-- star_rating: string (nullable = true) |-- state: string (nullable = true) |-- season: string (nullable = true)
legacy_pricing.show(5)
[Stage 30:> (0 + 1) / 1]
+-----------+--------------------+-----------+--------------------+-----+-----------------+---------------+-----------+-----+------+ |gas_station| timestamp|transc_date| store_loc|price| longitude| latitude|star_rating|state|season| +-----------+--------------------+-----------+--------------------+-----+-----------------+---------------+-----------+-----+------+ |Hanson Mart|2023-05-19T16:42:...| 2023-05-19|1713 Hanson Rd, E...| 3.19| -76.312236785888|39.424574738401| 3.7| MD|Spring| | Safeway|2023-05-19T16:45:...| 2023-05-19|91-1119 Keaunui D...| 4.57|-158.021469712257| 21.32794650025| 3.8| HI|Spring| | Chevron|2023-05-19T21:28:...| 2023-05-19|4860 Bill Gardner...| 2.89| -84.120844602584|33.353470102796| 3.7| GA|Spring| | Raceway|2023-05-20T12:30:...| 2023-05-20|641 US-1 S, Ediso...| 3.16| -74.39227938652|40.507942465825| 4.0| NJ|Spring| | Royal Mart|2023-05-20T12:32:...| 2023-05-20|56 2nd St, Highsp...| 3.29| -76.79547| 40.21131| 3.6| PA|Spring| +-----------+--------------------+-----------+--------------------+-----+-----------------+---------------+-----------+-----+------+ only showing top 5 rows
Merged Gasoline Pricing¶
odin_gasoline_pricing = gasoline_pricing.union(legacy_pricing)
odin_gasoline_pricing = odin_gasoline_pricing.withColumn('padd_district', get_padd_district('state') ) # PADD DISTRICT REGION
odin_gasoline_pricing = odin_gasoline_pricing.withColumn('month', f.month('transc_date') )
odin_gasoline_pricing = odin_gasoline_pricing.withColumn('month_name', get_month_name('month') )
odin_gasodin_gasoline_pricingoline_pricing = odin_gasoline_pricing.withColumn('state_name', get_state_name('state') )
odin_gasoline_pricing.show(5)
+--------------+-------------------+-----------+--------------------+-----+-----------+---------+-----------+-----+------+-------------+-----+----------+ | gas_station| timestamp|transc_date| store_loc|price| longitude| latitude|star_rating|state|season|padd_district|month|month_name| +--------------+-------------------+-----------+--------------------+-----+-----------+---------+-----------+-----+------+-------------+-----+----------+ |Murphy Express|2024-08-02 00:00:14| 2024-08-02|221 S White Sands...| 2.86|-105.959781|32.888533| 0.0| NM|Summer| PADD 5| 8| August| |Murphy Express|2024-08-02 00:00:16| 2024-08-02|199 Maryport Dr, ...| 2.85| -78.956583|33.635091| 0.0| SC|Summer| PADD 3| 8| August| |Murphy Express|2024-08-02 00:00:16| 2024-08-02|13443 South Hamil...| 3.15|-111.976158|40.506858| 0.0| UT|Summer| PADD 5| 8| August| | Murphy USA|2024-08-02 00:00:20| 2024-08-02|2010 West Grant S...| 2.78| -97.249409| 34.73856| 0.0| OK|Summer| PADD 3| 8| August| | Murphy USA|2024-08-02 00:00:21| 2024-08-02|210 Haynes Street...| 2.82| -86.081775|33.439887| 0.0| AL|Summer| PADD 3| 8| August| +--------------+-------------------+-----------+--------------------+-----+-----------+---------+-----------+-----+------+-------------+-----+----------+ only showing top 5 rows
odin_gasoline_pricing.printSchema()
root |-- gas_station: string (nullable = true) |-- timestamp: string (nullable = true) |-- transc_date: string (nullable = true) |-- store_loc: string (nullable = true) |-- price: double (nullable = true) |-- longitude: string (nullable = true) |-- latitude: string (nullable = true) |-- star_rating: string (nullable = true) |-- state: string (nullable = true) |-- season: string (nullable = true) |-- padd_district: string (nullable = true) |-- month: integer (nullable = true) |-- month_name: string (nullable = true)
Distribution of Gasoline Price in the US¶
gas_prices = odin_gasoline_pricing.select('price').rdd.flatMap(lambda row: row).collect()
fig = plt.figure(figsize=(12,4))
ax = fig.add_subplot()
sns.distplot(gas_prices, ax=ax)
ax.set_title("Distribution of Gasoline Price in the US", fontweight='bold', fontsize=14)
mean: float = round( np.mean(gas_prices) , 2 )
median: float = round( np.median(gas_prices) , 2 )
ymin, ymax = ax.get_ylim()
ax.axvline(x=mean, ymin=ymin, ymax=ymax, linestyle="--", color='black', label=f"Mean Gas Price: ${mean}")
ax.axvline(x=median, ymin=ymin, ymax=ymax, linestyle="-.", color='blue', label=f"Median Gas Price: ${median}")
ax.legend(title='Stats Desc', fancybox=True, shadow=True)
plt.tight_layout()
Daily Average Gasoline Price in the US¶
avg_daily_prices: List[Dict] = [ item.asDict() for item in odin_gasoline_pricing.groupby('transc_date').agg(f.round( f.avg('price').alias('avg_price') , 2).alias('avg_price') ).orderBy('transc_date').collect() ]
avg_price: 'DataFrame' = pd.DataFrame( avg_daily_prices ).iloc[1:]
avg_price['transc_date'] = pd.to_datetime(avg_price['transc_date'] )
fig = plt.figure()
ax = fig.add_subplot()
avg_price.set_index('transc_date')['avg_price'].plot(figsize=(12,4), ax=ax) # some desc July,2023
ax.set_title("Daily Average Gasoline Price in the US", fontweight='bold', fontsize=14)
ax.set_ylabel("$/Gallon", fontweight='bold')
plt.tight_layout()
Average Gas Price by Padd District and Season¶
odin_gasoline_pricing.filter("padd_district != 'N/A' ").filter("padd_district != 'Not Applicable' ").groupby('padd_district').pivot('season').agg(f.round(f.avg('price') , 2).alias('avg_price') ).show()
[Stage 158:======================================> (2 + 1) / 3]
+-------------+----+------+------+------+ |padd_district|Fall|Spring|Summer|Winter| +-------------+----+------+------+------+ | PADD 1| 3.1| 3.26| 3.36| 2.92| | PADD 2|2.87| 3.08| 3.22| 2.64| | PADD 3|2.69| 2.91| 3.01| 2.61| | PADD 5|3.42| 3.56| 3.75| 2.9| +-------------+----+------+------+------+
Average Gas Price by Padd District and Month¶
padd_district: 'DataFrame' = pd.DataFrame([ item.asDict() for item in odin_gasoline_pricing.groupby('padd_district').pivot('month').mean('price').alias('avg_price').collect() ] )
padd_district.rename({ str(k):v for k,v in MONTH_LOOKUP.items()}, axis=1).set_index('padd_district').applymap(lambda item: round(item, 2)).iloc[:, 1:].sort_index().iloc[2:, :]
# avg_price increase
January | February | March | April | May | June | July | August | September | October | November | December | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
padd_district | ||||||||||||
PADD 1 | 2.84 | 2.95 | 3.06 | 3.24 | 3.30 | 3.26 | 3.24 | 3.41 | 3.45 | 3.21 | 3.02 | 2.91 |
PADD 2 | 2.47 | 2.68 | 2.97 | 3.08 | 3.06 | 3.10 | 3.10 | 3.27 | 3.35 | 3.04 | 2.78 | 2.54 |
PADD 3 | 2.47 | 2.65 | 2.85 | 2.95 | 2.91 | 2.87 | 2.86 | 3.06 | 3.12 | 2.83 | 2.59 | 2.46 |
PADD 5 | 2.77 | 2.86 | 3.26 | 3.62 | 3.54 | 3.60 | 3.68 | 3.73 | 3.92 | 3.70 | 3.25 | 2.92 |
Average Gasoline Price in Warm/Hot Weather for PADD 1 District¶
- Connecticut
- Delaware
- Maine
- Maryland
- Massachusetts
- New Hampshire
- New Jersey
- New York
- Pennsylvania
- Rhode Island
- Vermont
padd1_pricing_df: 'DataFrame' = pd.read_excel("/mnt/odin_repo/misc/PET_PRI_GND_DCUS_R10_W.xls", sheet_name='Data 1')
padd1_df: 'DataFrame' = pd.DataFrame( [ item.asDict() for item in odin_gasoline_pricing.filter("padd_district == 'PADD 5' ").groupby('transc_date').agg(f.round( f.avg('price'), 2).alias('avg_price') ).orderBy('transc_date').collect() ] )
padd1_df['transc_date'] = pd.to_datetime( padd1_df['transc_date'] )
padd1_pricing_df.columns = padd1_pricing_df.iloc[1].tolist()
padd1_pricing_df = padd1_pricing_df.iloc[2:]
padd1_pricing_df['Date'] = pd.to_datetime(padd1_pricing_df['Date'] )
padd1_pricing_df = padd1_pricing_df.set_index('Date').replace({np.nan:0.0} ).astype(float)
cols: List[str] = ['Weekly East Coast Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'Weekly East Coast Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'Weekly East Coast Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)' ]
padd1_pricing_df = padd1_pricing_df[ cols].rename(dict( zip(cols, ['Regular', 'Midgrade', 'Premium' ],)), axis=1)
padd1_pricing_df['month_name'] = pd.Series(padd1_pricing_df.index).apply(lambda row: row.month_name() )
padd1_pricing_df['month_name'] = pd.Series(padd1_pricing_df.index).apply(lambda row: row.month_name() ).tolist()
padd1_pricing_df['season'] = pd.Series( padd1_pricing_df.index).apply(lambda row: 'Winter' if (row.month in SEASON_LOOKUP.get('Winter') or row.month == 12) else 'Fall' if row.month in SEASON_LOOKUP.get('Fall') else 'Spring' if row.month in SEASON_LOOKUP.get('Spring') else 'Summer' ).tolist()
padd1_pricing_df['year'] = pd.Series(padd1_pricing_df.index).apply(lambda row: row.year).tolist()
Average Gas Price in PADD District by (Gasoline Formulation
)¶
fig = plt.figure(figsize=(12, 6))
ax = fig.add_subplot()
padd1_pricing_df["2000":].plot(ax=ax) # TODO: decompose by season, month
ax.set_title("Weekly Gas Price in PADD 1 District", fontweight='bold', fontsize=14)
ax.set_ylabel("$/Gallon", fontweight='bold' )
ax.legend(title="Gasoline Formulations", fancybox=True, shadow=True)
plt.tight_layout()
Average Gas Price in PADD District¶
fig = plt.figure(figsize=(12,4))
ax = fig.add_subplot()
padd1_df.set_index('transc_date')['avg_price'].plot(ax=ax)
ax.set_title("Average Gasoline Price in PADD 1 District", fontweight='bold', fontsize=14)
ax.set_ylabel("$/Gallon", fontweight='bold')
ax.axvspan(xmin=19482, xmax=19610, hatch='/', alpha=0.2, ec='k', label='Summer 2023 Price', color='grey')
ax.axvspan(xmin=19850, xmax=ax.get_xlim()[-1], hatch='/', alpha=0.2, ec='k', label='Summer 2024 Price', color='grey')
ax.legend(title="Average Gasoline Price", fancybox=True, shadow=True, bbox_to_anchor=(1.23,1))
plt.tight_layout()
Weekly Gas Price Comparison in PADD 1
District¶
- Year: 2020 - Present (Weekly Avg Price Inc during SPR - SUM 2022)
fig = plt.figure(figsize=(12,6))
ax = fig.add_subplot()
padd1_pricing_df.query("year >= 2020").groupby(['year', 'season'] )[padd1_pricing_df.columns.tolist()[:3]].mean().map(lambda row: round(row,2) ).plot(kind='bar', ax=ax, hatch='//', ec='k')
ax.set_title("Weekly Gas Price Comparison in PADD 1 District", fontweight='bold', fontsize=14)
ax.set_ylabel("$/Gallon", fontweight='bold' )
ax.legend(bbox_to_anchor=(1.13, 1), fancybox=True, shadow=True )
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontweight='bold' )
plt.tight_layout()
PADD 1: Gas Price From 2022 - 2024¶
fig = plt.figure(figsize=(20,12))
ax = fig.add_subplot(2,2,1)
padd1_pricing_df["2021-01":"2021-12"][padd1_pricing_df.columns.tolist()[:3]].plot(ax=ax)
ax.legend(title="Gasoline Formulation", fancybox=True, shadow=True)
ax.set_title("PADD 1: Gas Price 2021", fontweight='bold')
ax.set_ylabel("$/Gallon")
ax = fig.add_subplot(2,2,2)
padd1_pricing_df["2022-01":"2022-12"][padd1_pricing_df.columns.tolist()[:3]].plot(ax=ax)
ax.legend(title="Gasoline Formulation", fancybox=True, shadow=True)
ax.set_title("PADD 1: Gas Price 2022", fontweight='bold')
ax.set_ylabel("$/Gallon")
ax = fig.add_subplot(2,2,3)
padd1_pricing_df["2023-01":"2023-12"][padd1_pricing_df.columns.tolist()[:3]].plot(ax=ax)
ax.legend(title="Gasoline Formulation", fancybox=True, shadow=True)
ax.set_title("PADD 1: Gas Price 2023", fontweight='bold')
ax.set_ylabel("$/Gallon")
ax = fig.add_subplot(2,2,4)
padd1_pricing_df["2022-01":][padd1_pricing_df.columns.tolist()[:3]].plot(ax=ax)
ax.legend(title="Gasoline Formulation", fancybox=True, shadow=True)
ax.set_title("PADD 1: Gas Price 2024", fontweight='bold')
ax.set_ylabel("$/Gallon")
plt.tight_layout()
Average Gasoline Price by Season in PADD 1 District
¶
Reid Vapor Pressure¶
The difference involves the fuel’s Reid Vapor Pressure (RVP), which is a measure of how easily the fuel evaporates at a given temperature. Winter-blend gas has a higher RVP because the fuel must be able to evaporate at low temperatures for the engine to operate properly. Summer-blend gas has a lower RVP to prevent excessive evaporation when outside temperatures rise. That evaporation can cause vapor lock in an engine on hot days and contributes to unhealthy ground-level ozone and smog levels.
It’s All About Butane, More or Less¶
Summer-blend gas contains 2 percent butane, but that percentage is higher in the winter blend. Butane is used as lighter fluid and is a secondary ingredient in fuel canisters used for gas grills and camping. As a fuel, butane is less expensive than other gasoline components, but its high volatility limits how much can be included in summer-grade fuel.
.odin_gasoline_pricing.filter("padd_district == 'PADD 1' ").groupby('state_name').pivot('season').agg(f.round(f.avg('price') , 2).alias('avg_price') ).orderBy('state_name').show()
[Stage 313:======================================> (2 + 1) / 3]
+-------------+----+------+------+------+ | state_name|Fall|Spring|Summer|Winter| +-------------+----+------+------+------+ | Connecticut|3.15| 3.34| 3.44| 2.91| | Delaware| 3.0| 3.28| 3.41| 2.96| | Maine|3.13| 3.3| 3.43| 2.94| | Maryland|3.02| 3.29| 3.32| 2.97| |Massachusetts| 3.1| 3.15| 3.29| 2.8| |New Hampshire| 3.1| 3.21| 3.33| 2.86| | New Jersey|3.02| 3.18| 3.31| 2.84| | New York|2.99| 3.13| 3.3| 2.78| | Pennsylvania|3.22| 3.31| 3.28| 3.04| | Rhode Island|3.23| 3.31| 3.44| 2.95| | Vermont|3.39| 3.41| 3.56| 3.06| +-------------+----+------+------+------+
Correlation between Season and Price in PADD 1
District¶
- Summer = 0
- Spring = 1
- Fall = 2
- Winter = 3
df1: 'DataFrame' = pd.DataFrame([ item.asDict() for item in odin_gasoline_pricing.select('season', 'price' ).collect() ] )
df1['encoded_season'] = pd.factorize(df1['season'] )[0]
%%time
ax = sns.lmplot(data=df1, x='encoded_season', y='price', scatter_kws={'alpha': 0.3}, height=6, aspect=2, palette='flare', x_jitter=0.3, y_jitter=0.3)
ax.ax.set_title("Correlations between Season and Gasoline Price in PADD 1 District", fontweight='bold', fontsize=14)
plt.tight_layout()
CPU times: user 3min 19s, sys: 1min 45s, total: 5min 5s Wall time: 2min 34s
seasonal_padd1: List[Dict] = odin_gasoline_pricing.filter("padd_district == 'PADD 1' ").select('transc_date', 'state', 'price', 'season' ).collect()
seasonal_padd1_df: 'DataFrame' = pd.DataFrame([ item.asDict() for item in seasonal_padd1 ] )
seasonal_padd1_df['state_name'] = seasonal_padd1_df['state'].apply(lambda row: INVERSE_STATES.get(row))
seasonal_padd1_df['transc_date'] = pd.to_datetime(seasonal_padd1_df['transc_date'] )
Boxplot: Seasonal Gasoline Price in PADD 1
District¶
fig = plt.figure(figsize=(20,10))
for index,state_name in enumerate( seasonal_padd1_df['state_name'].unique().tolist() , 1):
ax = fig.add_subplot(4,3,index)
seasonal_padd1_df.query(f"state_name == '{state_name}' ").boxplot(by='season', column='price', ax=ax)
ax.set_title(f"Avg Price in {state_name}", fontweight='bold', )
ax.set_ylabel("$/Gallon", fontweight='bold')
plt.tight_layout()
Monthly Gasoline Price in PADD 1
District¶
- The average gasoline price slightly increases between June - September
odin_gasoline_pricing = odin_gasoline_pricing.withColumn('state_name', get_state_name('state'))
odin_gasoline_pricing.filter("padd_district == 'PADD 1'").groupby('month').pivot('state_name').agg(f.round(f.avg('price') , 2).alias('avg_price') ).orderBy('month').show()
[Stage 73:===================> (1 + 2) / 3]
+-----+-----------+--------+-----+--------+-------------+-------------+----------+--------+------------+------------+-------+ |month|Connecticut|Delaware|Maine|Maryland|Massachusetts|New Hampshire|New Jersey|New York|Pennsylvania|Rhode Island|Vermont| +-----+-----------+--------+-----+--------+-------------+-------------+----------+--------+------------+------------+-------+ | 1| 2.82| 2.84| 2.88| 2.85| 2.74| 2.79| 2.77| 2.65| 2.98| 2.87| 3.01| | 2| 2.95| 3.01| 2.97| 3.01| 2.81| 2.87| 2.88| 2.78| 3.07| 2.97| 3.06| | 3| 3.07| 3.1| 3.06| 3.18| 2.9| 2.98| 2.97| 2.97| 3.15| 3.08| 3.17| | 4| 3.3| 3.25| 3.28| 3.39| 3.11| 3.17| 3.12| 3.08| 3.33| 3.28| 3.36| | 5| 3.39| 3.35| 3.34| 3.28| 3.19| 3.25| 3.24| 3.18| 3.34| 3.36| 3.46| | 6| 3.35| 3.26| 3.33| 3.21| 3.18| 3.24| 3.21| 3.19| 3.26| 3.33| 3.46| | 7| 3.38| 3.34| 3.36| 3.12| 3.21| 3.26| 3.25| 3.19| 3.07| 3.35| 3.49| | 8| 3.48| 3.45| 3.43| 3.46| 3.29| 3.33| 3.32| 3.3| 3.41| 3.46| 3.56| | 9| 3.53| 3.42| 3.53| 3.34| 3.41| 3.42| 3.4| 3.43| 3.49| 3.56| 3.68| | 10| 3.33| 3.03| 3.28| 3.07| 3.23| 3.27| 3.13| 3.11| 3.29| 3.38| 3.52| | 11| 3.02| 2.93| 3.06| 2.97| 2.98| 3.01| 2.95| 2.89| 3.15| 3.18| 3.31| | 12| 2.89| 2.9| 2.91| 2.89| 2.88| 2.84| 2.85| 2.77| 3.08| 2.98| 3.16| +-----+-----------+--------+-----+--------+-------------+-------------+----------+--------+------------+------------+-------+
Daily Average Gasoline Price Trends in PADD 1
District¶
fig = plt.figure(figsize=(20,10))
for index,state_name in enumerate( seasonal_padd1_df['state_name'].unique().tolist() , 1):
ax = fig.add_subplot(4,3,index)
seasonal_padd1_df.query(f"state_name == '{state_name}' ").set_index('transc_date')['price'].resample('D').mean().interpolate().plot(ax=ax)
ax.set_title(f"Daily Avg Gasoline Price Trends in {state_name}", fontweight='bold', )
ax.set_ylabel("$/Gallon", fontweight='bold')
plt.tight_layout()
Gas Price by City¶
query_execution_id: str = "64703f3f-0ac4-4941-bb53-116ec83e65d4"
s3_path: str = odin_athena.get_query_execution(query_execution_id=query_execution_id).get('s3_path')
pricing_df: 'DataFrame' = pd.read_csv(s3_path)
pricing_df['state_name'] = pricing_df['state'].apply(lambda row: INVERSE_STATES.get(row))
pricing_df['padd_district'] = pricing_df['state_name'].apply(lambda row: PADD_AREAS.get( INVERSE_US_REGIONS.get(row)) )
pricing_df = pricing_df.dropna(subset='transc_date')
pricing_df['transc_date'] = pd.to_datetime(pricing_df['transc_date'] )
padd1_pricing: 'DataFrame' = pricing_df[ pricing_df['state_name'].apply(lambda row: row in US_REGIONS.get('Northeast') ) ]
padd1_pricing['month'] = padd1_pricing['transc_date'].apply(lambda row: row.month)
padd1_pricing['season'] = padd1_pricing['month'].apply(lambda row: 'Summer' if row in SEASON_LOOKUP.get('Summer') else 'Fall' if row in SEASON_LOOKUP.get('Fall') else 'Spring' if row in SEASON_LOOKUP.get('Spring') else 'Winter' )
INFO:aiobotocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
fig = plt.figure(figsize=(25,25))
for index,state in enumerate( padd1_pricing['state_name'].unique() ,1):
ax = fig.add_subplot(4,3,index)
padd1_pricing.query(f"state_name == '{state}' ").groupby('city')['price'].mean().map(lambda row: round(row, 2)).sort_values(ascending=False).head(10).plot(kind='bar', hatch='/', ax=ax, ec='k')
ax.set_title(f"Avg Gas Price: Top 10 City in {state}", fontweight='bold', fontsize=12)
ax.set_ylabel("$/Gallon")
plt.tight_layout()
Weekly Gas Price in Massachusetts
¶
- Conventional Premium Gasoline
query: str = """
SELECT
p.period,
p.value,
p.units,
CASE
WHEN DATE_FORMAT( CONCAT('2024', '-', MONTH(p.period), '-', DAY(p.period)), '%Y-%m-%d' ) BETWEEN DATE_FORMAT("2024-06-20", '%Y-%m-%d' ) AND DATE_FORMAT("2024-09-22", '%Y-%m-%d' ) THEN 'Summer'
WHEN DATE_FORMAT( CONCAT('2024', '-', MONTH(p.period), '-', DAY(p.period)), '%Y-%m-%d' ) BETWEEN DATE_FORMAT("2024-09-22", '%Y-%m-%d' ) AND DATE_FORMAT("2024-12-20", '%Y-%m-%d' ) THEN 'Fall'
WHEN DATE_FORMAT( CONCAT('2024', '-', MONTH(p.period), '-', DAY(p.period)), '%Y-%m-%d' ) BETWEEN DATE_FORMAT("2024-03-21", '%Y-%m-%d' ) AND DATE_FORMAT("2024-06-19", '%Y-%m-%d' ) THEN 'Spring'
ELSE 'Winter'
END AS `season`
FROM pricing_massachusetts p
"""
ma_weekly_pricing: 'DataFrame' = ODINSpark(app_name="Massachusetts Weekly Gas Pricing").exec_odin_mysql_query(query=query)
ma_weekly_pricing = ma_weekly_pricing.withColumn('month', f.month('period') )
ma_weekly_pricing = ma_weekly_pricing.withColumnRenamed('value', 'price' )
ma_weekly_pricing = ma_weekly_pricing.withColumn('year', f.year('period'))
ma_weekly_pricing = ma_weekly_pricing.withColumn('price', f.col('price').cast('double') )
DEBUG:odin_eia_db.engine.odin_spark: ..` ./sdmds:` +dmmds+-` -+ymmmmmmmmdo:` ./osyyy/` `ommmmmmmmdhso:` ./shmmmmmmmmmmmmmmds/. ./ohmmmdmdmm/ +mmmmmmmmmmmmmmmhy+//:://///++shdmmmmmmmmmmhsdmmmmmmmmmddyso+/::--.-://+syhdmmmmmmmmmmmmh` :dmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm+ .hdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmsooodmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd. ommmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmh+osoymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm/ :dmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+oohmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmms `hdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdsoo+ososmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd. /dmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmyoo++ooooymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmo `ydmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmhoooooooooodmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd` :mmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+o+ooooosmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm/ ommmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmy+o+o+o+o+oooymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmy ydmmmdddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmhooo+ooo+oo+ooodmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd. .dmmmddddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+oo+o+ooo+oosmmmmmmmmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmd: -dmmmmddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmsoo+oooo+oo+oooo+hmmmmmmmmmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmdo +dmmmmdmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmh+oo+oooooooooooooodmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdy sdmmmmddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+ooooo+o+oooooo+smmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdh ymmmmddsoosssssyyyyyyyhhhhhhhhhhhhhhhhhs+o+o++o+o++oooooooo+hhhddddddddddddddmdmmmddmdmdmdmdmmmmmmmd hmmmmdddyo+++++++++ooo+++o+oooooooooooooo+oosyyyhhhddhdddddyssssoooooooooooooooooooosoooooooymmmmmmd hmmmmddmmdhso++++++++o++o+++ooo++++o+++ohhdddmddddddddddddddddddhy+++o++oo+oooooooooo+++++shdmmmmmmh ymmmmdddmmmmdyso++++o+o+ooooooooooo+oshdmddddddddmddddddddddddhyso+oooooo++o++o+o++o++oshdmmmmmmmmdh ydmmmddmmmmmdmmdyso+o++oooooooo+++o+ddddddmddddmddddddddddddddddhyo+oo+o+++o+++++++oshddmmmmmmmmmmdy odmmmdddmdmddmmmmmhyoo++o+o+ooo+o+osddmmdddddddddddddddddddddddhyso++o++++++++++ooydmmmmmdmmmmmmmmdo /dmmmdddmddmdmmmmmmmdysoo++++oo+o+oddddmdddddddddddddddddddddddddyo++++o++++++oyddmmmmmmmmmmmmmmmmd/ .dmmmmdddddmddmmmmmmmmmdyso+++o+ooyddmmdmddddddddmddddddddddddddysoooo++++ooyhdmmdmmmmmmmmmmmmmmmdd` `ydmmmdddddmmddmmmmmmmmmmdhso++++sdddmmmmdddddddmmddddddddddddmddyo+++++oyhdmmmmmmmmmmmmmmmmmmmmmds +dmmmdddddddddmmmmmmmmmmmmmdysoohdmdmmmmmdddddddmmdddddddddddddhyo++oyhdmmmmmmmmmmmmmmmmmmmmmmmmd: -dmmmddddddddmmmmmmmmmmmmmmmmdhooyddmdmmmddddmdddddddddddddddddy+oyhdmmmmmmmmmmmmmmmmmmmmmdmmmmdy` `ydmmmdddddddddddmmmmmmmmmmmmmyoohddddmmmdddmmmddmddmdddddddddds+sdmmmmmmmmmmmmmmmmmmmmmmmmmmmmd/ :dmmmddddmddddddmmmmmmmmmmmddoo+yddddddmdmmmmmdddddddddddddddddy+ymmmmmmdmmmmmdmmmmmmmmmmmmmmdh` sdmmmddddddmddmmmmmmmmmmmmdso+oosyyyydddmmddddddddddddddddddyos+odmmmmmddmmmmmmmmdmmmmmmmmmmdo -dmmmdddddddmdmmmmmmmmdmmmyoo++o+o++oyddmdddddddddddddddddddh+o++sdmmmmdmmmmmmmmmmmmmmmmmmmmy` odmmdddddddmmmmmmmmmmddmhoooooo+oo++ydddddhhyosssdddddddddso+++o+hmdmmmmmmmmmmmmdmmdmmmmmmd- `hdmmmdddddmdmmmdmmmmdddoo++oo+++o++hddhso+++++++oydddddddyo+++o+oddmmmmmmdmmmmmmmmmmmmmmdo -hdmmdddddmdddddmddmmdy++++oo++++o+hys++ooshyso+++yddddyso++++o++ydmmmmmmmmmdmmmmddmmmmdy` /ddmdddddddmdddmmmmmh+o+++oo+++++++++oshdddddhyoooyhddy++++++++++ddmdmmmdmdmmmmmddmmmdh- `sdmmmdddddddmdmmmmdo++++oo++++++++oydddmdddddddhyoosyo+++++o++o+ydddddddmdmmmmmmmmmmd: `yddmmdddddmmmmmmms++++o++++++ooyddddmmdmdmdddmdddys+++++++o+++oodddmddddddmmddmmmdd+ -ydmmmddddddddmmh++++++o++oshddddddmmmdddddddddddddhyoo+++o+++o+sdddmddmmmddddmmmd+ .hdmmmdmddddddds++++++osydmmdddddmmddddddddddddddddddhso++++++oohdmmmddddddmmmddo` -ydmmmmdddddds++++ooyhdmmdmmmddddmdddmdmddddddmddddddddyso+++++odddmddddddmmddo` .ydmmmddddddoo+oyhdmmmmmdmmmddddddddmdddmddddddddddddddddyo++++sddmdddddmmdd+` `sdmmmmmdds+shddmdmmmmmmddmdddddddddddddddddddddddddddddddhs+++hdddddmmmdd+` `+ddmmmddyhddmmmmmmmmmdddmdddddddddddddddddddddddddddddddmdhsosddddmmmdh/ :hdmmmddddddmmmmdmmmddddddddddddddddddddddddddddddddddddddddddddmmmdh- .sdmmmdddmmmmmmmdmdmmddddddddddddddddddddddddddddddddddddddddmmmddy. /hdmmmmmmmmmddddddmddddddddddddddddddddddddddddddddddddddmmmmdh/` .sddmmmmddddddddddddddddddddddddddddddddddddddddddddddddddddo. /hdmdmmdmddddddmdmddddddddddddddddddddddddddddddddddddddo- .odddmdmddddddmdmddddddddddddddddmdddddddddddddmdddddo- .+hddmdmddddmdddddddddddddddddddddddddddddddddddh+. `/yddddmddddmddddddddddddddddddmddddddddddddy/` `:sddddddddddddddddddddddddmddddddddmddds: ./ydddddddddddddddddddddddddddddddy/` -+hdddddddddddddddddddddddddy+. `-ohdddddddddddddddddddy/. -+yddddddddddddyo-` .:oyddddhs/. ▄▄▄▄ ▄▄▄▄▄ ▄▄▄▄▄▄ ▄▄▄ ▄▄ ▄▄▄▄ ▄▄▄▄▄▄ ▄▄ ▄▄▄▄▄▄ ▄▄ ▄▄▄ ██▀▀██ ██▀▀▀██ ▀▀██▀▀ ███ ██ ▄█▀▀▀▀█ ██▀▀▀▀█▄ ████ ██▀▀▀▀██ ██ ██▀ ██ ██ ██ ██ ██ ██▀█ ██ ██▄ ██ ██ ████ ██ ██ ██▄██ ██ ██ ██ ██ ██ ██ ██ ██ ▀████▄ ██████▀ ██ ██ ███████ █████ ██ ██ ██ ██ ██ ██ █▄██ █████ ▀██ ██ ██████ ██ ▀██▄ ██ ██▄ ██▄▄██ ██▄▄▄██ ▄▄██▄▄ ██ ███ █▄▄▄▄▄█▀ ██ ▄██ ██▄ ██ ██ ██ ██▄ ▀▀▀▀ ▀▀▀▀▀ ▀▀▀▀▀▀ ▀▀ ▀▀▀ ▀▀▀▀▀ ▀▀ ▀▀ ▀▀ ▀▀ ▀▀▀ ▀▀ ▀▀ [ OK ] Launching ODIN @2024-08-24T15:49:29.080684 DEBUG:odin_eia_db.engine.odin_spark:[ OK ] Loading Spark Configuration For ODIN-Project... DEBUG:odin_eia_db.engine.odin_spark:[ OK ] Loading Hadoop Configuration For ODIN-Project...
ma_weekly_pricing.printSchema()
root |-- period: string (nullable = true) |-- price: string (nullable = true) |-- units: string (nullable = true) |-- season: string (nullable = true) |-- month: integer (nullable = true) |-- year: integer (nullable = true)
Boxplot: Weekly Conventional Premium Gasoline¶
ma_weekly_pricing.groupby('year').pivot('season').agg(f.round(f.avg('price'), 2) ).orderBy('year').na.fill(0.0).show(50)
+----+----+------+------+------+ |year|Fall|Spring|Summer|Winter| +----+----+------+------+------+ |1994|1.29| 0.0| 0.0| 1.27| |1995|1.25| 1.33| 1.33| 1.25| |1996|1.38| 1.4| 1.37| 1.28| |1997|1.34| 1.36| 1.36| 1.39| |1998|1.15| 1.2| 1.17| 1.21| |1999|1.41| 1.22| 1.32| 1.11| |2000|1.63| 1.63| 1.67| 1.53| |2001|1.35| 1.7| 1.54| 1.53| |2002|1.56| 1.53| 1.52| 1.33| |2003|1.67| 1.66| 1.72| 1.71| |2004|2.13| 2.03| 2.04| 1.83| |2005|2.69| 2.35| 2.67| 2.11| |2006|2.47| 2.99| 3.05| 2.55| |2007|3.18| 3.12| 3.06| 2.63| |2008|2.78| 3.89| 4.12| 3.13| |2009|2.84| 2.47| 2.81| 2.24| |2010|3.09| 3.04| 2.9| 2.99| |2011|3.66| 4.01| 3.88| 3.48| |2012|3.88| 3.98| 3.86| 3.84| |2013|3.71| 3.86| 3.89| 3.91| |2014| 3.4| 4.03| 3.88| 3.66| |2015|2.64| 2.95| 2.95| 2.67| |2016|2.76| 2.67| 2.69| 2.41| |2017|3.05| 2.88| 2.91| 2.86| |2018|3.19| 3.3| 3.31| 3.05| |2019|3.07| 3.24| 3.13| 2.9| |2020|2.69| 2.5| 2.71| 2.95| |2021|3.87| 3.44| 3.62| 3.13| |2022|4.07| 4.87| 4.71| 4.07| |2023| 4.0| 4.14| 4.28| 4.02| |2024| 0.0| 4.21| 4.17| 3.93| +----+----+------+------+------+
fig = plt.figure(figsize=(20,5))
for index,year in enumerate(sorted(ma_df.query("year >= 2020")['year'].unique()), 1):
ax = fig.add_subplot(1,5,index)
ma_df.query(f"year == {year}").boxplot(by='season', column='price', ax=ax)
ax.set_ylabel("$/Gallon")
ax.set_title(f"{year}", fontweight='bold', fontsize=12)
plt.tight_layout()
Trends in Weekly Conventional Premium Gasoline¶
ma_df: 'DataFrame' = pd.DataFrame([ item.asDict() for item in ma_weekly_pricing.select('period', 'price', 'season', 'month').collect() ] )
ma_df['period'] = pd.to_datetime(ma_df['period'] )
ma_df['year'] = ma_df['period'].apply(lambda row: row.year )
fig = plt.figure(figsize=(12,4))
ax = fig.add_subplot()
ma_df.query("year > 2020").set_index("period")['price'].plot(ax=ax)
ax.set_title("Weekly Conventional Premium Gasoline in Massachusetts", fontweight='bold', fontsize=14)
ax.set_ylabel("$/Gallon")
plt.tight_layout()
Crude Oils Imports to the US¶
query: str = """
SELECT
DISTINCT
c.originName,
c.destinationName,
c.period,
c.gradeName,
c.quantity,
c.start_date,
c.end_date,
c.destinationTypeName,
c.quantity_units
FROM crude_oil_imports c
"""
crude_oils: 'ODINSpark' = ODINSpark(app_name='Crude Oils Imports').exec_odin_mysql_query(query=query)
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials DEBUG:odin_eia_db.engine.odin_spark: ..` ./sdmds:` +dmmds+-` -+ymmmmmmmmdo:` ./osyyy/` `ommmmmmmmdhso:` ./shmmmmmmmmmmmmmmds/. ./ohmmmdmdmm/ +mmmmmmmmmmmmmmmhy+//:://///++shdmmmmmmmmmmhsdmmmmmmmmmddyso+/::--.-://+syhdmmmmmmmmmmmmh` :dmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm+ .hdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmsooodmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd. ommmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmh+osoymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm/ :dmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+oohmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmms `hdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdsoo+ososmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd. /dmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmyoo++ooooymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmo `ydmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmhoooooooooodmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd` :mmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+o+ooooosmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm/ ommmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmy+o+o+o+o+oooymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmy ydmmmdddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmhooo+ooo+oo+ooodmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd. .dmmmddddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+oo+o+ooo+oosmmmmmmmmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmd: -dmmmmddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmsoo+oooo+oo+oooo+hmmmmmmmmmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmdo +dmmmmdmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmh+oo+oooooooooooooodmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdy sdmmmmddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+ooooo+o+oooooo+smmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdh ymmmmddsoosssssyyyyyyyhhhhhhhhhhhhhhhhhs+o+o++o+o++oooooooo+hhhddddddddddddddmdmmmddmdmdmdmdmmmmmmmd hmmmmdddyo+++++++++ooo+++o+oooooooooooooo+oosyyyhhhddhdddddyssssoooooooooooooooooooosoooooooymmmmmmd hmmmmddmmdhso++++++++o++o+++ooo++++o+++ohhdddmddddddddddddddddddhy+++o++oo+oooooooooo+++++shdmmmmmmh ymmmmdddmmmmdyso++++o+o+ooooooooooo+oshdmddddddddmddddddddddddhyso+oooooo++o++o+o++o++oshdmmmmmmmmdh ydmmmddmmmmmdmmdyso+o++oooooooo+++o+ddddddmddddmddddddddddddddddhyo+oo+o+++o+++++++oshddmmmmmmmmmmdy odmmmdddmdmddmmmmmhyoo++o+o+ooo+o+osddmmdddddddddddddddddddddddhyso++o++++++++++ooydmmmmmdmmmmmmmmdo /dmmmdddmddmdmmmmmmmdysoo++++oo+o+oddddmdddddddddddddddddddddddddyo++++o++++++oyddmmmmmmmmmmmmmmmmd/ .dmmmmdddddmddmmmmmmmmmdyso+++o+ooyddmmdmddddddddmddddddddddddddysoooo++++ooyhdmmdmmmmmmmmmmmmmmmdd` `ydmmmdddddmmddmmmmmmmmmmdhso++++sdddmmmmdddddddmmddddddddddddmddyo+++++oyhdmmmmmmmmmmmmmmmmmmmmmds +dmmmdddddddddmmmmmmmmmmmmmdysoohdmdmmmmmdddddddmmdddddddddddddhyo++oyhdmmmmmmmmmmmmmmmmmmmmmmmmd: -dmmmddddddddmmmmmmmmmmmmmmmmdhooyddmdmmmddddmdddddddddddddddddy+oyhdmmmmmmmmmmmmmmmmmmmmmdmmmmdy` `ydmmmdddddddddddmmmmmmmmmmmmmyoohddddmmmdddmmmddmddmdddddddddds+sdmmmmmmmmmmmmmmmmmmmmmmmmmmmmd/ :dmmmddddmddddddmmmmmmmmmmmddoo+yddddddmdmmmmmdddddddddddddddddy+ymmmmmmdmmmmmdmmmmmmmmmmmmmmdh` sdmmmddddddmddmmmmmmmmmmmmdso+oosyyyydddmmddddddddddddddddddyos+odmmmmmddmmmmmmmmdmmmmmmmmmmdo -dmmmdddddddmdmmmmmmmmdmmmyoo++o+o++oyddmdddddddddddddddddddh+o++sdmmmmdmmmmmmmmmmmmmmmmmmmmy` odmmdddddddmmmmmmmmmmddmhoooooo+oo++ydddddhhyosssdddddddddso+++o+hmdmmmmmmmmmmmmdmmdmmmmmmd- `hdmmmdddddmdmmmdmmmmdddoo++oo+++o++hddhso+++++++oydddddddyo+++o+oddmmmmmmdmmmmmmmmmmmmmmdo -hdmmdddddmdddddmddmmdy++++oo++++o+hys++ooshyso+++yddddyso++++o++ydmmmmmmmmmdmmmmddmmmmdy` /ddmdddddddmdddmmmmmh+o+++oo+++++++++oshdddddhyoooyhddy++++++++++ddmdmmmdmdmmmmmddmmmdh- `sdmmmdddddddmdmmmmdo++++oo++++++++oydddmdddddddhyoosyo+++++o++o+ydddddddmdmmmmmmmmmmd: `yddmmdddddmmmmmmms++++o++++++ooyddddmmdmdmdddmdddys+++++++o+++oodddmddddddmmddmmmdd+ -ydmmmddddddddmmh++++++o++oshddddddmmmdddddddddddddhyoo+++o+++o+sdddmddmmmddddmmmd+ .hdmmmdmddddddds++++++osydmmdddddmmddddddddddddddddddhso++++++oohdmmmddddddmmmddo` -ydmmmmdddddds++++ooyhdmmdmmmddddmdddmdmddddddmddddddddyso+++++odddmddddddmmddo` .ydmmmddddddoo+oyhdmmmmmdmmmddddddddmdddmddddddddddddddddyo++++sddmdddddmmdd+` `sdmmmmmdds+shddmdmmmmmmddmdddddddddddddddddddddddddddddddhs+++hdddddmmmdd+` `+ddmmmddyhddmmmmmmmmmdddmdddddddddddddddddddddddddddddddmdhsosddddmmmdh/ :hdmmmddddddmmmmdmmmddddddddddddddddddddddddddddddddddddddddddddmmmdh- .sdmmmdddmmmmmmmdmdmmddddddddddddddddddddddddddddddddddddddddmmmddy. /hdmmmmmmmmmddddddmddddddddddddddddddddddddddddddddddddddmmmmdh/` .sddmmmmddddddddddddddddddddddddddddddddddddddddddddddddddddo. /hdmdmmdmddddddmdmddddddddddddddddddddddddddddddddddddddo- .odddmdmddddddmdmddddddddddddddddmdddddddddddddmdddddo- .+hddmdmddddmdddddddddddddddddddddddddddddddddddh+. `/yddddmddddmddddddddddddddddddmddddddddddddy/` `:sddddddddddddddddddddddddmddddddddmddds: ./ydddddddddddddddddddddddddddddddy/` -+hdddddddddddddddddddddddddy+. `-ohdddddddddddddddddddy/. -+yddddddddddddyo-` .:oyddddhs/. ▄▄▄▄ ▄▄▄▄▄ ▄▄▄▄▄▄ ▄▄▄ ▄▄ ▄▄▄▄ ▄▄▄▄▄▄ ▄▄ ▄▄▄▄▄▄ ▄▄ ▄▄▄ ██▀▀██ ██▀▀▀██ ▀▀██▀▀ ███ ██ ▄█▀▀▀▀█ ██▀▀▀▀█▄ ████ ██▀▀▀▀██ ██ ██▀ ██ ██ ██ ██ ██ ██▀█ ██ ██▄ ██ ██ ████ ██ ██ ██▄██ ██ ██ ██ ██ ██ ██ ██ ██ ▀████▄ ██████▀ ██ ██ ███████ █████ ██ ██ ██ ██ ██ ██ █▄██ █████ ▀██ ██ ██████ ██ ▀██▄ ██ ██▄ ██▄▄██ ██▄▄▄██ ▄▄██▄▄ ██ ███ █▄▄▄▄▄█▀ ██ ▄██ ██▄ ██ ██ ██ ██▄ ▀▀▀▀ ▀▀▀▀▀ ▀▀▀▀▀▀ ▀▀ ▀▀▀ ▀▀▀▀▀ ▀▀ ▀▀ ▀▀ ▀▀ ▀▀▀ ▀▀ ▀▀ [ OK ] Launching ODIN @2024-08-17T12:46:27.371798 DEBUG:odin_eia_db.engine.odin_spark:[ OK ] Loading Spark Configuration For ODIN-Project...
:: loading settings :: url = jar:file:/opt/spark-3.5.1-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c7099faa-4fbb-4400-b4d8-c5dfb8a129a3;1.0
confs: [default]
found org.apache.hadoop#hadoop-aws;3.3.1 in central
found com.amazonaws#aws-java-sdk-bundle;1.11.901 in central
found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 451ms :: artifacts dl 14ms
:: modules in use:
com.amazonaws#aws-java-sdk-bundle;1.11.901 from central in [default]
org.apache.hadoop#hadoop-aws;3.3.1 from central in [default]
org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
---------------------------------------------------------------------
| | modules || artifacts |
| conf | number| search|dwnlded|evicted|| number|dwnlded|
---------------------------------------------------------------------
| default | 3 | 0 | 0 | 0 || 3 | 0 |
---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-c7099faa-4fbb-4400-b4d8-c5dfb8a129a3
confs: [default]
0 artifacts copied, 3 already retrieved (0kB/5ms)
24/08/17 12:46:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
DEBUG:odin_eia_db.engine.odin_spark:[ OK ] Loading Hadoop Configuration For ODIN-Project...
crude_oils.printSchema()
root |-- originName: string (nullable = true) |-- destinationName: string (nullable = true) |-- gradeName: string (nullable = true) |-- quantity: long (nullable = true) |-- start_date: string (nullable = true) |-- end_date: string (nullable = true) |-- destinationTypeName: string (nullable = true) |-- quantity_units: string (nullable = true)
crude_oils.show(5)
+----------+---------------+-----------+--------+-------------------+-------------------+-------------------+----------------+ |originName|destinationName| gradeName|quantity| start_date| end_date|destinationTypeName| quantity_units| +----------+---------------+-----------+--------+-------------------+-------------------+-------------------+----------------+ | Algeria| United States|Light Sweet| 376|2024-05-01 00:00:00|2024-05-31 00:00:00| United States|thousand barrels| | World| United States| Medium| 60583|2024-05-01 00:00:00|2024-05-31 00:00:00| United States|thousand barrels| | World| United States|Light Sweet| 13151|2024-05-01 00:00:00|2024-05-31 00:00:00| United States|thousand barrels| | World| United States| Light Sour| 9427|2024-05-01 00:00:00|2024-05-31 00:00:00| United States|thousand barrels| | World| United States|Heavy Sweet| 10436|2024-05-01 00:00:00|2024-05-31 00:00:00| United States|thousand barrels| +----------+---------------+-----------+--------+-------------------+-------------------+-------------------+----------------+ only showing top 5 rows
Top Crude Oils Importer to the US¶
- Unit measurement:
Thousand Barrels
crude_oils.groupby('originName').agg(f.sum('quantity').alias('total_quantity')).orderBy('total_quantity', ascending=False).show(25)
+-------------------+--------------+ | originName|total_quantity| +-------------------+--------------+ | World| 13418377| | Non-OPEC| 11295804| | Canada| 8051193| | Canada (Region)| 7937144| | Other Americas| 2902756| | OPEC| 2118304| | Middle East| 1516824| | Mexico| 1347899| | Saudi Arabia| 982260| | Africa| 654750| | Colombia| 495460| | Iraq| 493918| | Ecuador| 331919| | Brazil| 278898| | Nigeria| 266747| | Eurasia| 220633| | Venezuela| 185937| | Russia| 169785| | Europe| 165639| | Libya| 136668| | Guyana| 133373| | United Kingdom| 95560| | Argentina| 89782| |Trinidad and Tobago| 84291| | Ghana| 77096| +-------------------+--------------+ only showing top 25 rows
importer_df: 'DataFrame' = pd.DataFrame( [ item.asDict() for item in crude_oils.groupby('originName').agg(f.sum('quantity').alias('total_quantity')).orderBy('total_quantity', ascending=False).collect() ] )
countries, quantities = zip(*importer_df.iloc[7:29].head(10).set_index('originName').to_dict().get('total_quantity').items())
labels: List[str] = [ "%s: %s%%" % (item[0], round( (item[1]/importer_df['total_quantity'].sum()) *100,2) ) for item in zip(countries, quantities) ]
regions_df: 'DataFrame' = importer_df[ importer_df['originName'].isin(['World', 'Non-OPEC', 'Other Americas', 'OPEC', 'Middle East'] )]
regions_df['labels'] = regions_df.apply(lambda row: "%s: %s %%" % (row['originName'] ,round(row['total_quantity']/importer_df['total_quantity'].sum() ,2 ) * 100), axis=1 )
countries, percentage = zip( *regions_df.set_index('labels')['total_quantity'].to_dict().items())
fig = plt.figure(figsize=(12,5))
ax = fig.add_subplot(1,2,1)
ax.pie(x=list(percentage), labels=list(countries), colors=sns.color_palette(palette='flare', n_colors=10).as_hex() )
ax.set_title("Top Crude Oils Importer to the US", fontweight='bold', fontsize=14)
plt.gcf().gca().add_artist(plt.Circle(xy=(0,0), color='white', radius=0.45))
plt.tight_layout()
ax = fig.add_subplot(1,2,2)
ax.pie(x=list(quantities), labels=labels, colors=sns.color_palette(palette='flare', n_colors=10).as_hex() )
ax.set_title("Top 10 Crude Oils Importer Countries to the US", fontweight='bold', fontsize=14)
plt.gcf().gca().add_artist(plt.Circle(xy=(0,0), color='white', radius=0.45))
plt.tight_layout()
Crude Oils import from the Middle East to the US¶
crude_oils.filter("originName == 'Middle East' ").groupby('gradename').count().orderBy('count', ascending=False).show() # type of crude oils imported to the US from the Middle East
+-----------+-----+ | gradename|count| +-----------+-----+ | Medium| 69| | Heavy Sour| 69| | Light Sour| 64| |Light Sweet| 9| |Heavy Sweet| 3| +-----------+-----+
me_imports: 'DataFrame' = pd.DataFrame( [item.asDict() for item in crude_oils.filter("originName == 'Middle East' ").select('start_date', 'quantity', 'gradeName' ).collect() ] )
me_imports['start_date'] = pd.to_datetime(me_imports['start_date'] )
me_imports = me_imports.rename({'start_date': 'period'}, axis=1)
fig = plt.figure(figsize=(20,10))
for index,grade_name in enumerate( me_imports['gradeName'].unique().tolist() , 1) :
ax = fig.add_subplot(3,2,index)
me_imports.set_index('period').query(f"gradeName == '{grade_name}'")['quantity'].plot(ax=ax)
ax.set_title(f"Crude Oils {grade_name} import from the Middle East", fontweight='bold', fontsize=14)
ax.set_ylabel("Thousand Barrels", fontweight='bold')
plt.tight_layout()
Petroleum Weekly Stocks¶
query: str = """
/*
DESCRIPTION
-----------
- Motor Gasoline related (i.e., Reformulated, Finished, Conventional)
*/
SELECT
p.period,
MONTH(p.period) AS `month`,
p.area_name,
p.product_name,
p.process_name,
p.series,
p.series_description,
p.value,
p.units,
CASE
WHEN DATE_FORMAT( CONCAT('2024', '-', MONTH(p.period), '-', DAY(p.period)), '%Y-%m-%d' ) BETWEEN DATE_FORMAT("2024-06-20", '%Y-%m-%d' ) AND DATE_FORMAT("2024-09-22", '%Y-%m-%d' ) THEN 'Summer'
WHEN DATE_FORMAT( CONCAT('2024', '-', MONTH(p.period), '-', DAY(p.period)), '%Y-%m-%d' ) BETWEEN DATE_FORMAT("2024-09-22", '%Y-%m-%d' ) AND DATE_FORMAT("2024-12-20", '%Y-%m-%d' ) THEN 'Fall'
WHEN DATE_FORMAT( CONCAT('2024', '-', MONTH(p.period), '-', DAY(p.period)), '%Y-%m-%d' ) BETWEEN DATE_FORMAT("2024-12-21", '%Y-%m-%d' ) AND DATE_FORMAT("2025-03-20", '%Y-%m-%d' ) THEN 'Winter'
WHEN DATE_FORMAT( CONCAT('2024', '-', MONTH(p.period), '-', DAY(p.period)), '%Y-%m-%d' ) BETWEEN DATE_FORMAT("2024-03-21", '%Y-%m-%d' ) AND DATE_FORMAT("2024-06-19", '%Y-%m-%d' ) THEN 'Spring'
ELSE 'Winter'
END AS `season`
FROM petroleum_weekly_stocks p
WHERE p.series_description LIKE '%motor%' AND p.area_name LIKE '%PADD 1%'
"""
petrol_weekly_stocks: 'DataFrame' = ODINSpark(app_name="Weekly Petroleum Stocks").exec_odin_mysql_query(query=query)
# exclude conv_motor_gas with alcohol/ finished_motor > 55
petrol_weekly_stocks = petrol_weekly_stocks.filter("product_name != 'Conventional Motor Gasoline with Alcohol' ").filter("product_name != 'Finished Motor Gasoline Conventional >55' ")
DEBUG:odin_eia_db.engine.odin_spark: ..` ./sdmds:` +dmmds+-` -+ymmmmmmmmdo:` ./osyyy/` `ommmmmmmmdhso:` ./shmmmmmmmmmmmmmmds/. ./ohmmmdmdmm/ +mmmmmmmmmmmmmmmhy+//:://///++shdmmmmmmmmmmhsdmmmmmmmmmddyso+/::--.-://+syhdmmmmmmmmmmmmh` :dmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm+ .hdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmsooodmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd. ommmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmh+osoymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm/ :dmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+oohmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmms `hdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdsoo+ososmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd. /dmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmyoo++ooooymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmo `ydmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmhoooooooooodmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd` :mmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+o+ooooosmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm/ ommmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmy+o+o+o+o+oooymmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmy ydmmmdddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmhooo+ooo+oo+ooodmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd. .dmmmddddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+oo+o+ooo+oosmmmmmmmmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmd: -dmmmmddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmsoo+oooo+oo+oooo+hmmmmmmmmmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmdo +dmmmmdmmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmh+oo+oooooooooooooodmmmmdmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdy sdmmmmddmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdooo+ooooo+o+oooooo+smmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmdh ymmmmddsoosssssyyyyyyyhhhhhhhhhhhhhhhhhs+o+o++o+o++oooooooo+hhhddddddddddddddmdmmmddmdmdmdmdmmmmmmmd hmmmmdddyo+++++++++ooo+++o+oooooooooooooo+oosyyyhhhddhdddddyssssoooooooooooooooooooosoooooooymmmmmmd hmmmmddmmdhso++++++++o++o+++ooo++++o+++ohhdddmddddddddddddddddddhy+++o++oo+oooooooooo+++++shdmmmmmmh ymmmmdddmmmmdyso++++o+o+ooooooooooo+oshdmddddddddmddddddddddddhyso+oooooo++o++o+o++o++oshdmmmmmmmmdh ydmmmddmmmmmdmmdyso+o++oooooooo+++o+ddddddmddddmddddddddddddddddhyo+oo+o+++o+++++++oshddmmmmmmmmmmdy odmmmdddmdmddmmmmmhyoo++o+o+ooo+o+osddmmdddddddddddddddddddddddhyso++o++++++++++ooydmmmmmdmmmmmmmmdo /dmmmdddmddmdmmmmmmmdysoo++++oo+o+oddddmdddddddddddddddddddddddddyo++++o++++++oyddmmmmmmmmmmmmmmmmd/ .dmmmmdddddmddmmmmmmmmmdyso+++o+ooyddmmdmddddddddmddddddddddddddysoooo++++ooyhdmmdmmmmmmmmmmmmmmmdd` `ydmmmdddddmmddmmmmmmmmmmdhso++++sdddmmmmdddddddmmddddddddddddmddyo+++++oyhdmmmmmmmmmmmmmmmmmmmmmds +dmmmdddddddddmmmmmmmmmmmmmdysoohdmdmmmmmdddddddmmdddddddddddddhyo++oyhdmmmmmmmmmmmmmmmmmmmmmmmmd: -dmmmddddddddmmmmmmmmmmmmmmmmdhooyddmdmmmddddmdddddddddddddddddy+oyhdmmmmmmmmmmmmmmmmmmmmmdmmmmdy` `ydmmmdddddddddddmmmmmmmmmmmmmyoohddddmmmdddmmmddmddmdddddddddds+sdmmmmmmmmmmmmmmmmmmmmmmmmmmmmd/ :dmmmddddmddddddmmmmmmmmmmmddoo+yddddddmdmmmmmdddddddddddddddddy+ymmmmmmdmmmmmdmmmmmmmmmmmmmmdh` sdmmmddddddmddmmmmmmmmmmmmdso+oosyyyydddmmddddddddddddddddddyos+odmmmmmddmmmmmmmmdmmmmmmmmmmdo -dmmmdddddddmdmmmmmmmmdmmmyoo++o+o++oyddmdddddddddddddddddddh+o++sdmmmmdmmmmmmmmmmmmmmmmmmmmy` odmmdddddddmmmmmmmmmmddmhoooooo+oo++ydddddhhyosssdddddddddso+++o+hmdmmmmmmmmmmmmdmmdmmmmmmd- `hdmmmdddddmdmmmdmmmmdddoo++oo+++o++hddhso+++++++oydddddddyo+++o+oddmmmmmmdmmmmmmmmmmmmmmdo -hdmmdddddmdddddmddmmdy++++oo++++o+hys++ooshyso+++yddddyso++++o++ydmmmmmmmmmdmmmmddmmmmdy` /ddmdddddddmdddmmmmmh+o+++oo+++++++++oshdddddhyoooyhddy++++++++++ddmdmmmdmdmmmmmddmmmdh- `sdmmmdddddddmdmmmmdo++++oo++++++++oydddmdddddddhyoosyo+++++o++o+ydddddddmdmmmmmmmmmmd: `yddmmdddddmmmmmmms++++o++++++ooyddddmmdmdmdddmdddys+++++++o+++oodddmddddddmmddmmmdd+ -ydmmmddddddddmmh++++++o++oshddddddmmmdddddddddddddhyoo+++o+++o+sdddmddmmmddddmmmd+ .hdmmmdmddddddds++++++osydmmdddddmmddddddddddddddddddhso++++++oohdmmmddddddmmmddo` -ydmmmmdddddds++++ooyhdmmdmmmddddmdddmdmddddddmddddddddyso+++++odddmddddddmmddo` .ydmmmddddddoo+oyhdmmmmmdmmmddddddddmdddmddddddddddddddddyo++++sddmdddddmmdd+` `sdmmmmmdds+shddmdmmmmmmddmdddddddddddddddddddddddddddddddhs+++hdddddmmmdd+` `+ddmmmddyhddmmmmmmmmmdddmdddddddddddddddddddddddddddddddmdhsosddddmmmdh/ :hdmmmddddddmmmmdmmmddddddddddddddddddddddddddddddddddddddddddddmmmdh- .sdmmmdddmmmmmmmdmdmmddddddddddddddddddddddddddddddddddddddddmmmddy. /hdmmmmmmmmmddddddmddddddddddddddddddddddddddddddddddddddmmmmdh/` .sddmmmmddddddddddddddddddddddddddddddddddddddddddddddddddddo. /hdmdmmdmddddddmdmddddddddddddddddddddddddddddddddddddddo- .odddmdmddddddmdmddddddddddddddddmdddddddddddddmdddddo- .+hddmdmddddmdddddddddddddddddddddddddddddddddddh+. `/yddddmddddmddddddddddddddddddmddddddddddddy/` `:sddddddddddddddddddddddddmddddddddmddds: ./ydddddddddddddddddddddddddddddddy/` -+hdddddddddddddddddddddddddy+. `-ohdddddddddddddddddddy/. -+yddddddddddddyo-` .:oyddddhs/. ▄▄▄▄ ▄▄▄▄▄ ▄▄▄▄▄▄ ▄▄▄ ▄▄ ▄▄▄▄ ▄▄▄▄▄▄ ▄▄ ▄▄▄▄▄▄ ▄▄ ▄▄▄ ██▀▀██ ██▀▀▀██ ▀▀██▀▀ ███ ██ ▄█▀▀▀▀█ ██▀▀▀▀█▄ ████ ██▀▀▀▀██ ██ ██▀ ██ ██ ██ ██ ██ ██▀█ ██ ██▄ ██ ██ ████ ██ ██ ██▄██ ██ ██ ██ ██ ██ ██ ██ ██ ▀████▄ ██████▀ ██ ██ ███████ █████ ██ ██ ██ ██ ██ ██ █▄██ █████ ▀██ ██ ██████ ██ ▀██▄ ██ ██▄ ██▄▄██ ██▄▄▄██ ▄▄██▄▄ ██ ███ █▄▄▄▄▄█▀ ██ ▄██ ██▄ ██ ██ ██ ██▄ ▀▀▀▀ ▀▀▀▀▀ ▀▀▀▀▀▀ ▀▀ ▀▀▀ ▀▀▀▀▀ ▀▀ ▀▀ ▀▀ ▀▀ ▀▀▀ ▀▀ ▀▀ [ OK ] Launching ODIN @2024-08-20T17:29:48.504807 DEBUG:odin_eia_db.engine.odin_spark:[ OK ] Loading Spark Configuration For ODIN-Project... DEBUG:odin_eia_db.engine.odin_spark:[ OK ] Loading Hadoop Configuration For ODIN-Project...
petrol_weekly_stocks.withColumn('month_name', get_month_name('month') ).select('month_name').distinct().show()
+----------+ |month_name| +----------+ | July| | November| | October| | May| | August| | June| | December| | September| +----------+
petrol_weekly_stocks.printSchema()
root |-- period: string (nullable = true) |-- month: long (nullable = true) |-- area_name: string (nullable = true) |-- product_name: string (nullable = true) |-- process_name: string (nullable = true) |-- series: string (nullable = true) |-- series_description: string (nullable = true) |-- value: double (nullable = true) |-- units: string (nullable = true) |-- season: string (nullable = true)
petrol_weekly_stocks.show(5)
+----------+-----+---------+--------------------+-------------+--------------------+--------------------+-------+-----+------+ | period|month|area_name| product_name| process_name| series| series_description| value|units|season| +----------+-----+---------+--------------------+-------------+--------------------+--------------------+-------+-----+------+ |2023-12-22| 12| PADD 1|Reformulated Moto...|Ending Stocks| WG1ST_R10_1|East Coast (PADD ...| 4.0| MBBL|Winter| |2023-12-22| 12| PADD 1|Finished Motor Ga...|Ending Stocks|W_EPM0F_SAE_R10_MBBL|East Coast (PADD ...| 2405.0| MBBL|Winter| |2023-12-22| 12| PADD 1|Other Conventiona...|Ending Stocks| WG6ST_R10_1|East Coast (PADD ...| 2401.0| MBBL|Winter| |2023-12-22| 12| PADD 1|Reformulated Moto...|Ending Stocks| WGRSTP11|East Coast (PADD ...| 4.0| MBBL|Winter| |2023-12-22| 12| PADD 1|Motor Gasoline Bl...|Ending Stocks|W_EPOBGRR_SAE_R10...|East Coast (PADD ...|18867.0| MBBL|Winter| +----------+-----+---------+--------------------+-------------+--------------------+--------------------+-------+-----+------+ only showing top 5 rows
Conventional Motorl Gasoline Weekly Stocks (PADD 1
)¶
conv_motor: 'DataFrame' = pd.DataFrame( [ item.asDict() for item in petrol_weekly_stocks.filter(" product_name == 'Conventional Motor Gasoline' ").select('period', 'value', 'season').orderBy('period').collect() ] )
conv_motor['period'] = pd.to_datetime(conv_motor['period'])
conv_motor['month_name'] =conv_motor['period'].apply(lambda row: MONTH_LOOKUP.get( row.month ) )
conv_motor['month'] =conv_motor['period'].apply(lambda row: row.month )
fig = plt.figure(figsize=(16,4))
ax = fig.add_subplot(1,2,1)
conv_motor.set_index('period')['value'].plot(ax=ax, marker='s', mec='black')
ax.set_title("Conventional Motor Gasoline Weekly Stocks", fontweight='bold', fontsize=14)
ax.set_ylabel("MBBL")
ax = fig.add_subplot(1,2,2)
months, avg_vals = zip(*conv_motor.groupby('month')['value'].mean().to_dict().items())
ax.bar(x=list(months), height=list(avg_vals), hatch='//', ec='k')
ax.set_xticklabels(['May'] + conv_motor.sort_values(by='month')['month_name'].unique().tolist() , fontweight='bold' )
ax.set_title("Conventional Motor Gasoline Stocks by Month Name", fontweight='bold', fontsize=14)
ax.set_ylabel("MBBL")
for index,val in enumerate(avg_vals, 5):
ax.annotate(text=val, xy=(index - 0.3,val + 50))
plt.tight_layout()
Conventional Motorl Gasoline Stocks by Season¶
conv_motor.groupby('season')['value'].agg(['sum', 'mean']).applymap(lambda row: f"{row:.2f} MBBL" )
sum | mean | |
---|---|---|
season | ||
Fall | 38233.00 MBBL | 3186.08 MBBL |
Spring | 13953.00 MBBL | 3488.25 MBBL |
Summer | 42657.00 MBBL | 3046.93 MBBL |
Winter | 2401.00 MBBL | 2401.00 MBBL |
petrol_weekly_stocks.filter("product_name == 'Finished Motor Gasoline' ").select('period', 'value').orderBy('period').show(30)
+----------+------+ | period| value| +----------+------+ |2023-06-02|3442.0| |2023-06-09|3784.0| |2023-06-16|3936.0| |2023-06-23|3106.0| |2023-06-30|2751.0| |2023-07-07|3578.0| |2023-07-14|3193.0| |2023-07-21|2909.0| |2023-07-28|2849.0| |2023-08-04|2621.0| |2023-08-11|2529.0| |2023-08-18|3259.0| |2023-08-25|3642.0| |2023-09-01|3090.0| |2023-09-08|2800.0| |2023-09-15|3117.0| |2023-09-22|3276.0| |2023-09-29|3357.0| |2023-10-06|3007.0| |2023-10-13|3111.0| |2023-10-20|2904.0| |2023-10-27|3349.0| |2023-11-03|3534.0| |2023-11-10|3117.0| |2023-11-17|3273.0| |2023-11-24|3223.0| |2023-12-01|3048.0| |2023-12-08|3449.0| |2023-12-15|2908.0| |2023-12-22|2405.0| +----------+------+