import pandas as pd
import xlrd
import numpy as np
import seaborn as sns
import chart_studio.plotly as py
import cufflinks as cf
import plotly.io as pio
from random import randint
import plotly.graph_objs as go 
import chart_studio.plotly as py
import plotly.express as px
from plotly import __version__
from plotly.offline import download_plotlyjs,init_notebook_mode, plot, iplot
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


df = pd.read_excel('Oil_Production_-_After_1980_states.xls',sheet_name="Data 1")


# df.columns = df.iloc[1]


df.shape

(499, 33)


df.head()


df.describe

<bound method NDFrame.describe of           Date    FL  NY   PA  VA    WV    IL   IA    KS   KY  ...      TX  \
0   1981-01-15  3606  71  317   1   248  2000  438  5151  486  ...   80609   
1   1981-02-15  3162  68  286   1   234  1775  374  4727  465  ...   72591   
2   1981-03-15  3167  71  317   1   316  1960  435  5546  531  ...   80734   
3   1981-04-15  3064  69  306   1   241  2040  402  5524  555  ...   77344   
4   1981-05-15  3052  71  317   1   294  2095  400  5400  530  ...   79890   
..         ...   ...  ..  ...  ..   ...   ...  ...   ...  ...  ...     ...   
494 2022-03-15   122  12  485   1  1523   625  136  2402  303  ...  154266   
495 2022-04-15   122  22  417   0  1402   578  140  2394  165  ...  150468   
496 2022-05-15   124  23  423   0  1357   593  146  2465  137  ...  153951   
497 2022-06-15    67  22  408   0  1238   590  169  2421  109  ...  148874   
498 2022-07-15   106  23  398   0  1406   553  137  2518  395  ...  155149   

        CO   ID    MT    UT     WY     AK  AZ     CA  NV  
0     2506  NaN  2416  2150  11089  49789  28  30297  69  
1     2255  NaN  2342  2170  10016  45346  24  27455  56  
2     2527  NaN  2592  2150  11089  50150  23  30515  58  
3     2478  NaN  2501  2160  10731  48235  28  29540  57  
4     2498  NaN  2626  2150  11089  48977  30  31203  56  
..     ...  ...   ...   ...    ...    ...  ..    ...  ..  
494  13618  2.0  1637  3543   7484  13635   0  10676  20  
495  13196  3.0  1575  3461   7319  13246   1  10293  16  
496  13565  2.0  1655  3777   7752  13865   0  10509  18  
497  12977  4.0  1900  3635   7288  12559   0  10165  22  
498  13010  4.0  1862  3754   7615  13379   0  10352  22  

[499 rows x 33 columns]>


df.info

<bound method DataFrame.info of           Date    FL  NY   PA  VA    WV    IL   IA    KS   KY  ...      TX  \
0   1981-01-15  3606  71  317   1   248  2000  438  5151  486  ...   80609   
1   1981-02-15  3162  68  286   1   234  1775  374  4727  465  ...   72591   
2   1981-03-15  3167  71  317   1   316  1960  435  5546  531  ...   80734   
3   1981-04-15  3064  69  306   1   241  2040  402  5524  555  ...   77344   
4   1981-05-15  3052  71  317   1   294  2095  400  5400  530  ...   79890   
..         ...   ...  ..  ...  ..   ...   ...  ...   ...  ...  ...     ...   
494 2022-03-15   122  12  485   1  1523   625  136  2402  303  ...  154266   
495 2022-04-15   122  22  417   0  1402   578  140  2394  165  ...  150468   
496 2022-05-15   124  23  423   0  1357   593  146  2465  137  ...  153951   
497 2022-06-15    67  22  408   0  1238   590  169  2421  109  ...  148874   
498 2022-07-15   106  23  398   0  1406   553  137  2518  395  ...  155149   

        CO   ID    MT    UT     WY     AK  AZ     CA  NV  
0     2506  NaN  2416  2150  11089  49789  28  30297  69  
1     2255  NaN  2342  2170  10016  45346  24  27455  56  
2     2527  NaN  2592  2150  11089  50150  23  30515  58  
3     2478  NaN  2501  2160  10731  48235  28  29540  57  
4     2498  NaN  2626  2150  11089  48977  30  31203  56  
..     ...  ...   ...   ...    ...    ...  ..    ...  ..  
494  13618  2.0  1637  3543   7484  13635   0  10676  20  
495  13196  3.0  1575  3461   7319  13246   1  10293  16  
496  13565  2.0  1655  3777   7752  13865   0  10509  18  
497  12977  4.0  1900  3635   7288  12559   0  10165  22  
498  13010  4.0  1862  3754   7615  13379   0  10352  22  

[499 rows x 33 columns]>


df.head()


df_dt = df.groupby(['Date', 'FL'])


# print(df_dt)


dates = df.Date


states = df.columns


dates[:10]

0   1981-01-15
1   1981-02-15
2   1981-03-15
3   1981-04-15
4   1981-05-15
5   1981-06-15
6   1981-07-15
7   1981-08-15
8   1981-09-15
9   1981-10-15
Name: Date, dtype: datetime64[ns]


df.drop('Date', axis = 1, inplace = True)


colist = []
for col in df.columns:
    colist.append(df[col])


len(colist)

32


dataa = pd.concat(colist, axis = 0)


type(dataa)

pandas.core.series.Series


# for item in states:
#     print(item)


len(states) - 1

32


len(dataa)

15968


print(dataa)

0      3606.0
1      3162.0
2      3167.0
3      3064.0
4      3052.0
        ...  
494      20.0
495      16.0
496      18.0
497      22.0
498      22.0
Length: 15968, dtype: float64


print(dataa[:10])

0    3606.0
1    3162.0
2    3167.0
3    3064.0
4    3052.0
5    2856.0
6    2833.0
7    2650.0
8    2612.0
9    2709.0
dtype: float64


print(dates)

0     1981-01-15
1     1981-02-15
2     1981-03-15
3     1981-04-15
4     1981-05-15
         ...    
494   2022-03-15
495   2022-04-15
496   2022-05-15
497   2022-06-15
498   2022-07-15
Name: Date, Length: 499, dtype: datetime64[ns]


states_list = []
dates_list = []
for i in range(0,15968, 499):
    x = (i//499) + 1
    for k in range(499):
        states_list.append(states[x])
for i in range(32):
    for i in range(0, 499):
        dates_list.append(dates[i])


# dates_list


st_df = pd.DataFrame([states_list,dates_list, dataa])


st_df = st_df.transpose()


st_df.to_excel('stacked_states.xlsx')


st_df.head()


df.head()


df.index = dates_list[:499]


init_notebook_mode(connected=True)
# Allows us to use cufflinks offline
cf.go_offline()

l_plots = df.iplot(xTitle = 'Years ', yTitle = 'Oil Production(in thousands of barrels)')
plt.show()
plt.savefig('line_plots.png')

<Figure size 432x288 with 0 Axes>


# pio.write_image(l_plots, 'oil_lineplots.svg')


df.iplot(kind  = 'surface',xTitle = 'Years',yTitle = 'States', zTitle = 'Oil Production (in thousands of barrels)')


df.iplot(kind='box')


color = ['royalblue', 'green', 'mediumspringgreen', 'orange', 'red', 'yellow'
, 'cyan', 'deeppink', 'magenta', 'indigo', 'deepskyblue', 'gold'
, 'darkgoldenrod', 'lightcoral', 'turquoise', 'lime', 'purple', 
'olive', 'brown', 'black', 'chocolate', 'teal', 'tan', 'beige',
'steelblue', 'slategrey', 'orchid', 'crimson', 'palevioletred', 'lawngreen',
'orangered', 'darkslategrey']


df.iplot(kind = 'bar', color = color,xTitle = 'Years', yTitle = 'Oil Production (in thousands of barrels)')


sns.heatmap(df)

<AxesSubplot:>


nona_df =  df.dropna()


tx = nona_df.TX


# nona_df.drop(['TX'], axis = 1,  inplace = True)


x_train , x_test , y_train , y_test =train_test_split(nona_df,tx, test_size = 0.3)


train = tx[tx.index < pd.to_datetime("2020-11-01", format='%Y-%m-%d')]
test = tx[tx.index > pd.to_datetime("2020-11-01", format='%Y-%m-%d')]


nona_df =  df.dropna()


lin_mdl = LinearRegression(normalize=True)


lin_mdl.fit(x_train, y_train)

LinearRegression(normalize=True)


preds = lin_mdl.predict(x_test)


new_dates_list = tx[tx.index > pd.to_datetime("2017-11-01", format='%Y-%m-%d')]


preds_srs = pd.DataFrame(preds)


tx.iplot()


preds_srs.iplot()


st_avgs = dict()

for item in df.columns:
    st_avgs[item] = np.mean(df[item])


st_avgs_std = {k: v for k, v in sorted(st_avgs.items(), key=lambda item: item[1], reverse = True)}


st_avgs_std

{'TX': 65848.5751503006,
 'AK': 33534.19839679359,
 'CA': 21725.088176352707,
 'ND': 11241.905811623246,
 'OK': 9501.649298597195,
 'NM': 9268.875751503007,
 'LA': 9044.10621242485,
 'WY': 6797.761523046092,
 'CO': 4414.492985971944,
 'KS': 3786.008016032064,
 'UT': 2198.9498997995993,
 'MT': 1955.6673346693387,
 'MS': 1900.8957915831663,
 'IL': 1230.314629258517,
 'AL': 1048.2064128256513,
 'MI': 1040.4008016032064,
 'OH': 942.3847695390782,
 'AR': 757.4248496993988,
 'FL': 496.2805611222445,
 'WV': 385.0741482965932,
 'KY': 320.57715430861725,
 'NE': 312.8557114228457,
 'PA': 299.0681362725451,
 'IA': 221.6813627254509,
 'SD': 118.13426853707415,
 'NV': 95.72945891783567,
 'TN': 35.210420841683366,
 'NY': 32.362725450901806,
 'MO': 10.90380761523046,
 'AZ': 7.170340681362726,
 'ID': 2.609625668449198,
 'VA': 1.312625250501002}


ys =  [float(i) for i in st_avgs_std.values()]

xs = list(st_avgs_std.keys())


fig = px.bar(x = st_avgs_std.keys(), y = ys, color = xs,
             title = 'States Average Monthly Production (in thousands of barrels)')
fig.show()


df.head()


df['mean'] = np.mean(df, axis = 1)


df['mean'].iplot(xTitle = 'Years', yTitle = 'Total States Oil Production (in thousands of barrels)')

	Date	FL	NY	PA	VA	WV	IL	IA	KS	KY	...	TX	CO	ID	MT	UT	WY	AK	AZ	CA	NV
0	1981-01-15	3606	71	317	1	248	2000	438	5151	486	...	80609	2506	NaN	2416	2150	11089	49789	28	30297	69
1	1981-02-15	3162	68	286	1	234	1775	374	4727	465	...	72591	2255	NaN	2342	2170	10016	45346	24	27455	56
2	1981-03-15	3167	71	317	1	316	1960	435	5546	531	...	80734	2527	NaN	2592	2150	11089	50150	23	30515	58
3	1981-04-15	3064	69	306	1	241	2040	402	5524	555	...	77344	2478	NaN	2501	2160	10731	48235	28	29540	57
4	1981-05-15	3052	71	317	1	294	2095	400	5400	530	...	79890	2498	NaN	2626	2150	11089	48977	30	31203	56

	0	1	2
0	FL	1981-01-15	3606.0
1	FL	1981-02-15	3162.0
2	FL	1981-03-15	3167.0
3	FL	1981-04-15	3064.0
4	FL	1981-05-15	3052.0