import pandas as pdimport numpy as np
!ls sales*.xlsx
sales-feb-2014.xlsxsales-jan-2014.xlsxsales-mar-2014.xlsx
import glob
glob.glob('sales*.xlsx')
['sales-feb-2014.xlsx', 'sales-jan-2014.xlsx', 'sales-mar-2014.xlsx']
all_data = pd.DataFrame()for f in glob.glob('sales*.xlsx'): df = pd.read_excel(f) all_data = all_data.append(df, ignore_index=True)
all_data.head()
| account number | name | sku | quantity | unit price | ext price | date |
0 | 383080 | Will LLC | B1-20000 | 7 | 33.69 | 235.83 | 2014-02-01 09:04:59 |
1 | 412290 | Jerde-Hilpert | S1-27722 | 11 | 21.12 | 232.32 | 2014-02-01 11:51:46 |
2 | 412290 | Jerde-Hilpert | B1-86481 | 3 | 35.99 | 107.97 | 2014-02-01 17:24:32 |
3 | 412290 | Jerde-Hilpert | B1-20000 | 23 | 78.90 | 1814.70 | 2014-02-01 19:56:48 |
4 | 672390 | Kuhn-Gusikowski | S1-06532 | 48 | 55.82 | 2679.36 | 2014-02-02 03:45:20 |
all_data['date'] = pd.to_datetime(all_data['date'])all_data.head()
| account number | name | sku | quantity | unit price | ext price | date |
0 | 383080 | Will LLC | B1-20000 | 7 | 33.69 | 235.83 | 2014-02-01 09:04:59 |
1 | 412290 | Jerde-Hilpert | S1-27722 | 11 | 21.12 | 232.32 | 2014-02-01 11:51:46 |
2 | 412290 | Jerde-Hilpert | B1-86481 | 3 | 35.99 | 107.97 | 2014-02-01 17:24:32 |
3 | 412290 | Jerde-Hilpert | B1-20000 | 23 | 78.90 | 1814.70 | 2014-02-01 19:56:48 |
4 | 672390 | Kuhn-Gusikowski | S1-06532 | 48 | 55.82 | 2679.36 | 2014-02-02 03:45:20 |
status = pd.read_excel('customer-status.xlsx')status
| account number | name | status |
0 | 740150 | Barton LLC | gold |
1 | 714466 | Trantow-Barrows | silver |
2 | 218895 | Kulas Inc | bronze |
3 | 307599 | Kassulke, Ondricka and Metz | bronze |
4 | 412290 | Jerde-Hilpert | bronze |
5 | 729833 | Koepp Ltd | silver |
6 | 146832 | Kiehn-Spinka | silver |
7 | 688981 | Keeling LLC | silver |
8 | 786968 | Frami, Hills and Schmidt | silver |
9 | 239344 | Stokes LLC | gold |
10 | 672390 | Kuhn-Gusikowski | silver |
11 | 141962 | Herman LLC | gold |
12 | 424914 | White-Trantow | silver |
13 | 527099 | Sanford and Sons | bronze |
14 | 642753 | Pollich LLC | bronze |
15 | 257198 | Cronin, Oberbrunner and Spencer | gold |
all_data_st = pd.merge(all_data, status, how='left')all_data_st.head()
| account number | name | sku | quantity | unit price | ext price | date | status |
0 | 383080 | Will LLC | B1-20000 | 7 | 33.69 | 235.83 | 2014-02-01 09:04:59 | NaN |
1 | 412290 | Jerde-Hilpert | S1-27722 | 11 | 21.12 | 232.32 | 2014-02-01 11:51:46 | bronze |
2 | 412290 | Jerde-Hilpert | B1-86481 | 3 | 35.99 | 107.97 | 2014-02-01 17:24:32 | bronze |
3 | 412290 | Jerde-Hilpert | B1-20000 | 23 | 78.90 | 1814.70 | 2014-02-01 19:56:48 | bronze |
4 | 672390 | Kuhn-Gusikowski | S1-06532 | 48 | 55.82 | 2679.36 | 2014-02-02 03:45:20 | silver |
all_data_st[all_data_st['account number']==737550]
| account number | name | sku | quantity | unit price | ext price | date | status |
15 | 737550 | Fritsch, Russel and Anderson | S1-47412 | 40 | 51.01 | 2040.40 | 2014-02-05 01:20:40 | NaN |
25 | 737550 | Fritsch, Russel and Anderson | S1-06532 | 34 | 18.69 | 635.46 | 2014-02-07 09:22:02 | NaN |
66 | 737550 | Fritsch, Russel and Anderson | S1-27722 | 15 | 70.23 | 1053.45 | 2014-02-16 18:24:42 | NaN |
78 | 737550 | Fritsch, Russel and Anderson | S2-34077 | 26 | 93.35 | 2427.10 | 2014-02-20 18:45:43 | NaN |
80 | 737550 | Fritsch, Russel and Anderson | S1-93683 | 31 | 10.52 | 326.12 | 2014-02-21 13:55:45 | NaN |
91 | 737550 | Fritsch, Russel and Anderson | S2-82423 | 46 | 67.67 | 3112.82 | 2014-02-24 08:35:31 | NaN |
117 | 737550 | Fritsch, Russel and Anderson | S2-82423 | 14 | 81.92 | 1146.88 | 2014-01-03 19:07:37 | NaN |
122 | 737550 | Fritsch, Russel and Anderson | B1-53102 | 23 | 71.56 | 1645.88 | 2014-01-04 08:57:48 | NaN |
134 | 737550 | Fritsch, Russel and Anderson | B1-53636 | 42 | 42.06 | 1766.52 | 2014-01-08 00:02:11 | NaN |
140 | 737550 | Fritsch, Russel and Anderson | S1-27722 | 20 | 29.54 | 590.80 | 2014-01-09 13:20:40 | NaN |
150 | 737550 | Fritsch, Russel and Anderson | S1-93683 | 22 | 71.68 | 1576.96 | 2014-01-11 23:47:36 | NaN |
171 | 737550 | Fritsch, Russel and Anderson | S2-77896 | 41 | 38.43 | 1575.63 | 2014-01-16 22:39:11 | NaN |
177 | 737550 | Fritsch, Russel and Anderson | S2-82423 | 13 | 24.98 | 324.74 | 2014-01-18 06:32:00 | NaN |
209 | 737550 | Fritsch, Russel and Anderson | B1-53102 | 34 | 99.48 | 3382.32 | 2014-01-22 22:00:08 | NaN |
223 | 737550 | Fritsch, Russel and Anderson | B1-65551 | 49 | 19.14 | 937.86 | 2014-01-27 01:59:54 | NaN |
232 | 737550 | Fritsch, Russel and Anderson | S1-65481 | 47 | 45.71 | 2148.37 | 2014-01-28 15:49:45 | NaN |
238 | 737550 | Fritsch, Russel and Anderson | S1-50961 | 1 | 34.81 | 34.81 | 2014-01-30 23:56:20 | NaN |
245 | 737550 | Fritsch, Russel and Anderson | B1-50809 | 20 | 50.11 | 1002.20 | 2014-03-01 23:47:17 | NaN |
258 | 737550 | Fritsch, Russel and Anderson | S2-83881 | 12 | 63.60 | 763.20 | 2014-03-04 15:26:20 | NaN |
336 | 737550 | Fritsch, Russel and Anderson | S2-11481 | 47 | 49.22 | 2313.34 | 2014-03-22 23:44:18 | NaN |
339 | 737550 | Fritsch, Russel and Anderson | S1-93683 | 22 | 18.19 | 400.18 | 2014-03-23 10:33:33 | NaN |
379 | 737550 | Fritsch, Russel and Anderson | B1-65551 | 12 | 56.24 | 674.88 | 2014-03-31 08:43:24 | NaN |
all_data_st['status'].fillna('bronze')all_data_st.head()
| account number | name | sku | quantity | unit price | ext price | date | status |
0 | 383080 | Will LLC | B1-20000 | 7 | 33.69 | 235.83 | 2014-02-01 09:04:59 | NaN |
1 | 412290 | Jerde-Hilpert | S1-27722 | 11 | 21.12 | 232.32 | 2014-02-01 11:51:46 | bronze |
2 | 412290 | Jerde-Hilpert | B1-86481 | 3 | 35.99 | 107.97 | 2014-02-01 17:24:32 | bronze |
3 | 412290 | Jerde-Hilpert | B1-20000 | 23 | 78.90 | 1814.70 | 2014-02-01 19:56:48 | bronze |
4 | 672390 | Kuhn-Gusikowski | S1-06532 | 48 | 55.82 | 2679.36 | 2014-02-02 03:45:20 | silver |
all_data_st['status'] = all_data_st['status'].astype('category')
all_data_st.head()
| account number | name | sku | quantity | unit price | ext price | date | status |
0 | 383080 | Will LLC | B1-20000 | 7 | 33.69 | 235.83 | 2014-02-01 09:04:59 | NaN |
1 | 412290 | Jerde-Hilpert | S1-27722 | 11 | 21.12 | 232.32 | 2014-02-01 11:51:46 | bronze |
2 | 412290 | Jerde-Hilpert | B1-86481 | 3 | 35.99 | 107.97 | 2014-02-01 17:24:32 | bronze |
3 | 412290 | Jerde-Hilpert | B1-20000 | 23 | 78.90 | 1814.70 | 2014-02-01 19:56:48 | bronze |
4 | 672390 | Kuhn-Gusikowski | S1-06532 | 48 | 55.82 | 2679.36 | 2014-02-02 03:45:20 | silver |
all_data_st.dtypes
account number int64name objectsku objectquantity int64unit price float64ext price float64date datetime64[ns]status categorydtype: object
all_data_st.sort_values('status').head()
| account number | name | sku | quantity | unit price | ext price | date | status |
0 | 383080 | Will LLC | B1-20000 | 7 | 33.69 | 235.83 | 2014-02-01 09:04:59 | NaN |
258 | 737550 | Fritsch, Russel and Anderson | S2-83881 | 12 | 63.60 | 763.20 | 2014-03-04 15:26:20 | NaN |
261 | 383080 | Will LLC | S1-93683 | 28 | 90.86 | 2544.08 | 2014-03-05 05:11:49 | NaN |
66 | 737550 | Fritsch, Russel and Anderson | S1-27722 | 15 | 70.23 | 1053.45 | 2014-02-16 18:24:42 | NaN |
201 | 383080 | Will LLC | B1-53636 | 37 | 14.40 | 532.80 | 2014-01-20 22:18:21 | NaN |
all_data_st['status'].cat.set_categories(['gold', 'silver', 'bronze'], inplace=True)
all_data_st.sort_values('status').head()
| account number | name | sku | quantity | unit price | ext price | date | status |
0 | 383080 | Will LLC | B1-20000 | 7 | 33.69 | 235.83 | 2014-02-01 09:04:59 | NaN |
134 | 737550 | Fritsch, Russel and Anderson | B1-53636 | 42 | 42.06 | 1766.52 | 2014-01-08 00:02:11 | NaN |
289 | 383080 | Will LLC | S1-82801 | 3 | 77.06 | 231.18 | 2014-03-11 16:38:10 | NaN |
122 | 737550 | Fritsch, Russel and Anderson | B1-53102 | 23 | 71.56 | 1645.88 | 2014-01-04 08:57:48 | NaN |
299 | 604255 | Halvorson, Crona and Champlin | S1-47412 | 27 | 25.69 | 693.63 | 2014-03-14 16:01:34 | NaN |
all_data_st['status'].describe()
count 307unique 3top silverfreq 140Name: status, dtype: object
all_data_st.groupby(['status'])['quantity', 'unit price', 'ext price'].mean()
| quantity | unit price | ext price |
status | | | |
gold | 24.375000 | 53.723889 | 1351.944583 |
silver | 22.842857 | 57.272714 | 1320.032214 |
bronze | 24.284211 | 59.649684 | 1452.321368 |
all_data_st.groupby(['status'])['quantity', 'unit price', 'ext price'].agg([np.sum, np.mean, np.std])
| quantity | unit price | ext price |
| sum | mean | std | sum | mean | std | sum | mean | std |
status | | | | | | | | | |
gold | 1755 | 24.375000 | 14.575145 | 3868.12 | 53.723889 | 28.740080 | 97340.01 | 1351.944583 | 1182.657312 |
silver | 3198 | 22.842857 | 14.512843 | 8018.18 | 57.272714 | 26.556242 | 184804.51 | 1320.032214 | 1086.384051 |
bronze | 2307 | 24.284211 | 14.812777 | 5666.72 | 59.649684 | 26.530155 | 137970.53 | 1452.321368 | 1140.105645 |
all_data_st.drop_duplicates(subset=['account number', 'name']).ix[:,[0, 1, 7]].groupby(['status'])['name'].count()
statusgold 4silver 7bronze 5Name: name, dtype: int64