import pandas as pd
import xlrd
import numpy as np
import seaborn as sns
import chart_studio.plotly as py
import cufflinks as cf
import plotly.io as pio
from random import randint
import plotly.graph_objs as go
import chart_studio.plotly as py
import plotly.express as px
from plotly import __version__
from plotly.offline import download_plotlyjs,init_notebook_mode, plot, iplot
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
df = pd.read_excel('Oil_Production_-_After_1980_states.xls',sheet_name="Data 1")
# df.columns = df.iloc[1]
df.shape
(499, 33)
df.head()
| Date | FL | NY | PA | VA | WV | IL | IA | KS | KY | ... | TX | CO | ID | MT | UT | WY | AK | AZ | CA | NV | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1981-01-15 | 3606 | 71 | 317 | 1 | 248 | 2000 | 438 | 5151 | 486 | ... | 80609 | 2506 | NaN | 2416 | 2150 | 11089 | 49789 | 28 | 30297 | 69 |
| 1 | 1981-02-15 | 3162 | 68 | 286 | 1 | 234 | 1775 | 374 | 4727 | 465 | ... | 72591 | 2255 | NaN | 2342 | 2170 | 10016 | 45346 | 24 | 27455 | 56 |
| 2 | 1981-03-15 | 3167 | 71 | 317 | 1 | 316 | 1960 | 435 | 5546 | 531 | ... | 80734 | 2527 | NaN | 2592 | 2150 | 11089 | 50150 | 23 | 30515 | 58 |
| 3 | 1981-04-15 | 3064 | 69 | 306 | 1 | 241 | 2040 | 402 | 5524 | 555 | ... | 77344 | 2478 | NaN | 2501 | 2160 | 10731 | 48235 | 28 | 29540 | 57 |
| 4 | 1981-05-15 | 3052 | 71 | 317 | 1 | 294 | 2095 | 400 | 5400 | 530 | ... | 79890 | 2498 | NaN | 2626 | 2150 | 11089 | 48977 | 30 | 31203 | 56 |
5 rows × 33 columns
df.describe
<bound method NDFrame.describe of Date FL NY PA VA WV IL IA KS KY ... TX \
0 1981-01-15 3606 71 317 1 248 2000 438 5151 486 ... 80609
1 1981-02-15 3162 68 286 1 234 1775 374 4727 465 ... 72591
2 1981-03-15 3167 71 317 1 316 1960 435 5546 531 ... 80734
3 1981-04-15 3064 69 306 1 241 2040 402 5524 555 ... 77344
4 1981-05-15 3052 71 317 1 294 2095 400 5400 530 ... 79890
.. ... ... .. ... .. ... ... ... ... ... ... ...
494 2022-03-15 122 12 485 1 1523 625 136 2402 303 ... 154266
495 2022-04-15 122 22 417 0 1402 578 140 2394 165 ... 150468
496 2022-05-15 124 23 423 0 1357 593 146 2465 137 ... 153951
497 2022-06-15 67 22 408 0 1238 590 169 2421 109 ... 148874
498 2022-07-15 106 23 398 0 1406 553 137 2518 395 ... 155149
CO ID MT UT WY AK AZ CA NV
0 2506 NaN 2416 2150 11089 49789 28 30297 69
1 2255 NaN 2342 2170 10016 45346 24 27455 56
2 2527 NaN 2592 2150 11089 50150 23 30515 58
3 2478 NaN 2501 2160 10731 48235 28 29540 57
4 2498 NaN 2626 2150 11089 48977 30 31203 56
.. ... ... ... ... ... ... .. ... ..
494 13618 2.0 1637 3543 7484 13635 0 10676 20
495 13196 3.0 1575 3461 7319 13246 1 10293 16
496 13565 2.0 1655 3777 7752 13865 0 10509 18
497 12977 4.0 1900 3635 7288 12559 0 10165 22
498 13010 4.0 1862 3754 7615 13379 0 10352 22
[499 rows x 33 columns]>
df.info
<bound method DataFrame.info of Date FL NY PA VA WV IL IA KS KY ... TX \
0 1981-01-15 3606 71 317 1 248 2000 438 5151 486 ... 80609
1 1981-02-15 3162 68 286 1 234 1775 374 4727 465 ... 72591
2 1981-03-15 3167 71 317 1 316 1960 435 5546 531 ... 80734
3 1981-04-15 3064 69 306 1 241 2040 402 5524 555 ... 77344
4 1981-05-15 3052 71 317 1 294 2095 400 5400 530 ... 79890
.. ... ... .. ... .. ... ... ... ... ... ... ...
494 2022-03-15 122 12 485 1 1523 625 136 2402 303 ... 154266
495 2022-04-15 122 22 417 0 1402 578 140 2394 165 ... 150468
496 2022-05-15 124 23 423 0 1357 593 146 2465 137 ... 153951
497 2022-06-15 67 22 408 0 1238 590 169 2421 109 ... 148874
498 2022-07-15 106 23 398 0 1406 553 137 2518 395 ... 155149
CO ID MT UT WY AK AZ CA NV
0 2506 NaN 2416 2150 11089 49789 28 30297 69
1 2255 NaN 2342 2170 10016 45346 24 27455 56
2 2527 NaN 2592 2150 11089 50150 23 30515 58
3 2478 NaN 2501 2160 10731 48235 28 29540 57
4 2498 NaN 2626 2150 11089 48977 30 31203 56
.. ... ... ... ... ... ... .. ... ..
494 13618 2.0 1637 3543 7484 13635 0 10676 20
495 13196 3.0 1575 3461 7319 13246 1 10293 16
496 13565 2.0 1655 3777 7752 13865 0 10509 18
497 12977 4.0 1900 3635 7288 12559 0 10165 22
498 13010 4.0 1862 3754 7615 13379 0 10352 22
[499 rows x 33 columns]>
df.head()
| Date | FL | NY | PA | VA | WV | IL | IA | KS | KY | ... | TX | CO | ID | MT | UT | WY | AK | AZ | CA | NV | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1981-01-15 | 3606 | 71 | 317 | 1 | 248 | 2000 | 438 | 5151 | 486 | ... | 80609 | 2506 | NaN | 2416 | 2150 | 11089 | 49789 | 28 | 30297 | 69 |
| 1 | 1981-02-15 | 3162 | 68 | 286 | 1 | 234 | 1775 | 374 | 4727 | 465 | ... | 72591 | 2255 | NaN | 2342 | 2170 | 10016 | 45346 | 24 | 27455 | 56 |
| 2 | 1981-03-15 | 3167 | 71 | 317 | 1 | 316 | 1960 | 435 | 5546 | 531 | ... | 80734 | 2527 | NaN | 2592 | 2150 | 11089 | 50150 | 23 | 30515 | 58 |
| 3 | 1981-04-15 | 3064 | 69 | 306 | 1 | 241 | 2040 | 402 | 5524 | 555 | ... | 77344 | 2478 | NaN | 2501 | 2160 | 10731 | 48235 | 28 | 29540 | 57 |
| 4 | 1981-05-15 | 3052 | 71 | 317 | 1 | 294 | 2095 | 400 | 5400 | 530 | ... | 79890 | 2498 | NaN | 2626 | 2150 | 11089 | 48977 | 30 | 31203 | 56 |
5 rows × 33 columns
df_dt = df.groupby(['Date', 'FL'])
# print(df_dt)
dates = df.Date
states = df.columns
dates[:10]
0 1981-01-15 1 1981-02-15 2 1981-03-15 3 1981-04-15 4 1981-05-15 5 1981-06-15 6 1981-07-15 7 1981-08-15 8 1981-09-15 9 1981-10-15 Name: Date, dtype: datetime64[ns]
df.drop('Date', axis = 1, inplace = True)
colist = []
for col in df.columns:
colist.append(df[col])
len(colist)
32
dataa = pd.concat(colist, axis = 0)
type(dataa)
pandas.core.series.Series
# for item in states:
# print(item)
len(states) - 1
32
len(dataa)
15968
print(dataa)
0 3606.0
1 3162.0
2 3167.0
3 3064.0
4 3052.0
...
494 20.0
495 16.0
496 18.0
497 22.0
498 22.0
Length: 15968, dtype: float64
print(dataa[:10])
0 3606.0 1 3162.0 2 3167.0 3 3064.0 4 3052.0 5 2856.0 6 2833.0 7 2650.0 8 2612.0 9 2709.0 dtype: float64
print(dates)
0 1981-01-15
1 1981-02-15
2 1981-03-15
3 1981-04-15
4 1981-05-15
...
494 2022-03-15
495 2022-04-15
496 2022-05-15
497 2022-06-15
498 2022-07-15
Name: Date, Length: 499, dtype: datetime64[ns]
states_list = []
dates_list = []
for i in range(0,15968, 499):
x = (i//499) + 1
for k in range(499):
states_list.append(states[x])
for i in range(32):
for i in range(0, 499):
dates_list.append(dates[i])
# dates_list
st_df = pd.DataFrame([states_list,dates_list, dataa])
st_df = st_df.transpose()
st_df.to_excel('stacked_states.xlsx')
st_df.head()
| 0 | 1 | 2 | |
|---|---|---|---|
| 0 | FL | 1981-01-15 | 3606.0 |
| 1 | FL | 1981-02-15 | 3162.0 |
| 2 | FL | 1981-03-15 | 3167.0 |
| 3 | FL | 1981-04-15 | 3064.0 |
| 4 | FL | 1981-05-15 | 3052.0 |
df.head()
| FL | NY | PA | VA | WV | IL | IA | KS | KY | MI | ... | TX | CO | ID | MT | UT | WY | AK | AZ | CA | NV | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3606 | 71 | 317 | 1 | 248 | 2000 | 438 | 5151 | 486 | 2827 | ... | 80609 | 2506 | NaN | 2416 | 2150 | 11089 | 49789 | 28 | 30297 | 69 |
| 1 | 3162 | 68 | 286 | 1 | 234 | 1775 | 374 | 4727 | 465 | 2493 | ... | 72591 | 2255 | NaN | 2342 | 2170 | 10016 | 45346 | 24 | 27455 | 56 |
| 2 | 3167 | 71 | 317 | 1 | 316 | 1960 | 435 | 5546 | 531 | 2807 | ... | 80734 | 2527 | NaN | 2592 | 2150 | 11089 | 50150 | 23 | 30515 | 58 |
| 3 | 3064 | 69 | 306 | 1 | 241 | 2040 | 402 | 5524 | 555 | 2720 | ... | 77344 | 2478 | NaN | 2501 | 2160 | 10731 | 48235 | 28 | 29540 | 57 |
| 4 | 3052 | 71 | 317 | 1 | 294 | 2095 | 400 | 5400 | 530 | 2763 | ... | 79890 | 2498 | NaN | 2626 | 2150 | 11089 | 48977 | 30 | 31203 | 56 |
5 rows × 32 columns
df.index = dates_list[:499]
init_notebook_mode(connected=True)
# Allows us to use cufflinks offline
cf.go_offline()
l_plots = df.iplot(xTitle = 'Years ', yTitle = 'Oil Production(in thousands of barrels)')
plt.show()
plt.savefig('line_plots.png')
<Figure size 432x288 with 0 Axes>
# pio.write_image(l_plots, 'oil_lineplots.svg')
df.iplot(kind = 'surface',xTitle = 'Years',yTitle = 'States', zTitle = 'Oil Production (in thousands of barrels)')
df.iplot(kind='box')
color = ['royalblue', 'green', 'mediumspringgreen', 'orange', 'red', 'yellow'
, 'cyan', 'deeppink', 'magenta', 'indigo', 'deepskyblue', 'gold'
, 'darkgoldenrod', 'lightcoral', 'turquoise', 'lime', 'purple',
'olive', 'brown', 'black', 'chocolate', 'teal', 'tan', 'beige',
'steelblue', 'slategrey', 'orchid', 'crimson', 'palevioletred', 'lawngreen',
'orangered', 'darkslategrey']
df.iplot(kind = 'bar', color = color,xTitle = 'Years', yTitle = 'Oil Production (in thousands of barrels)')
sns.heatmap(df)
<AxesSubplot:>
nona_df = df.dropna()
tx = nona_df.TX
# nona_df.drop(['TX'], axis = 1, inplace = True)
x_train , x_test , y_train , y_test =train_test_split(nona_df,tx, test_size = 0.3)
train = tx[tx.index < pd.to_datetime("2020-11-01", format='%Y-%m-%d')]
test = tx[tx.index > pd.to_datetime("2020-11-01", format='%Y-%m-%d')]
nona_df = df.dropna()
lin_mdl = LinearRegression(normalize=True)
lin_mdl.fit(x_train, y_train)
LinearRegression(normalize=True)
preds = lin_mdl.predict(x_test)
new_dates_list = tx[tx.index > pd.to_datetime("2017-11-01", format='%Y-%m-%d')]
preds_srs = pd.DataFrame(preds)
tx.iplot()
preds_srs.iplot()
st_avgs = dict()
for item in df.columns:
st_avgs[item] = np.mean(df[item])
st_avgs_std = {k: v for k, v in sorted(st_avgs.items(), key=lambda item: item[1], reverse = True)}
st_avgs_std
{'TX': 65848.5751503006,
'AK': 33534.19839679359,
'CA': 21725.088176352707,
'ND': 11241.905811623246,
'OK': 9501.649298597195,
'NM': 9268.875751503007,
'LA': 9044.10621242485,
'WY': 6797.761523046092,
'CO': 4414.492985971944,
'KS': 3786.008016032064,
'UT': 2198.9498997995993,
'MT': 1955.6673346693387,
'MS': 1900.8957915831663,
'IL': 1230.314629258517,
'AL': 1048.2064128256513,
'MI': 1040.4008016032064,
'OH': 942.3847695390782,
'AR': 757.4248496993988,
'FL': 496.2805611222445,
'WV': 385.0741482965932,
'KY': 320.57715430861725,
'NE': 312.8557114228457,
'PA': 299.0681362725451,
'IA': 221.6813627254509,
'SD': 118.13426853707415,
'NV': 95.72945891783567,
'TN': 35.210420841683366,
'NY': 32.362725450901806,
'MO': 10.90380761523046,
'AZ': 7.170340681362726,
'ID': 2.609625668449198,
'VA': 1.312625250501002}
ys = [float(i) for i in st_avgs_std.values()]
xs = list(st_avgs_std.keys())
fig = px.bar(x = st_avgs_std.keys(), y = ys, color = xs,
title = 'States Average Monthly Production (in thousands of barrels)')
fig.show()
df.head()
| FL | NY | PA | VA | WV | IL | IA | KS | KY | MI | ... | TX | CO | ID | MT | UT | WY | AK | AZ | CA | NV | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1981-01-15 | 3606 | 71 | 317 | 1 | 248 | 2000 | 438 | 5151 | 486 | 2827 | ... | 80609 | 2506 | NaN | 2416 | 2150 | 11089 | 49789 | 28 | 30297 | 69 |
| 1981-02-15 | 3162 | 68 | 286 | 1 | 234 | 1775 | 374 | 4727 | 465 | 2493 | ... | 72591 | 2255 | NaN | 2342 | 2170 | 10016 | 45346 | 24 | 27455 | 56 |
| 1981-03-15 | 3167 | 71 | 317 | 1 | 316 | 1960 | 435 | 5546 | 531 | 2807 | ... | 80734 | 2527 | NaN | 2592 | 2150 | 11089 | 50150 | 23 | 30515 | 58 |
| 1981-04-15 | 3064 | 69 | 306 | 1 | 241 | 2040 | 402 | 5524 | 555 | 2720 | ... | 77344 | 2478 | NaN | 2501 | 2160 | 10731 | 48235 | 28 | 29540 | 57 |
| 1981-05-15 | 3052 | 71 | 317 | 1 | 294 | 2095 | 400 | 5400 | 530 | 2763 | ... | 79890 | 2498 | NaN | 2626 | 2150 | 11089 | 48977 | 30 | 31203 | 56 |
5 rows × 32 columns
df['mean'] = np.mean(df, axis = 1)
df['mean'].iplot(xTitle = 'Years', yTitle = 'Total States Oil Production (in thousands of barrels)')