KPMG Data Analytics Virtual Internship

Task 1: Data Quality Assessment

Contents

  • Transactions table
  • NewCustomerList table
  • CustomerDemographic table
  • CustomerAddress table
In [1]:
import pandas as pd
import numpy as np
import datetime as dt
In [2]:
df = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx', sheet_name = [1,2,3,4])

1. Transactions table¶

First Look At The Data¶

In [3]:
df[1]
Out[3]:
Note: The data and information in this document is reflective of a hypothetical situation and client. This document is to be used for KPMG Virtual Internship purposes only. Unnamed: 1 Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 Unnamed: 10 Unnamed: 11 Unnamed: 12
0 transaction_id product_id customer_id transaction_date online_order order_status brand product_line product_class product_size list_price standard_cost product_first_sold_date
1 1 2 2950 2017-02-25 00:00:00 False Approved Solex Standard medium medium 71.49 53.62 41245
2 2 3 3120 2017-05-21 00:00:00 True Approved Trek Bicycles Standard medium large 2091.47 388.92 41701
3 3 37 402 2017-10-16 00:00:00 False Approved OHM Cycles Standard low medium 1793.43 248.82 36361
4 4 88 3135 2017-08-31 00:00:00 False Approved Norco Bicycles Standard medium medium 1198.46 381.1 36145
... ... ... ... ... ... ... ... ... ... ... ... ... ...
19996 19996 51 1018 2017-06-24 00:00:00 True Approved OHM Cycles Standard high medium 2005.66 1203.4 37823
19997 19997 41 127 2017-11-09 00:00:00 True Approved Solex Road medium medium 416.98 312.74 35560
19998 19998 87 2284 2017-04-14 00:00:00 True Approved OHM Cycles Standard medium medium 1636.9 44.71 40410
19999 19999 6 2764 2017-07-03 00:00:00 False Approved OHM Cycles Standard high medium 227.88 136.73 38216
20000 20000 11 1144 2017-09-22 00:00:00 True Approved Trek Bicycles Standard medium small 1775.81 1580.47 36334

20001 rows × 13 columns

Comment:¶

As the first row of the sheet holds the real column names (the imported header is just the disclaimer note), we will promote the first row to be the header.

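Equivalently, the disclaimer row can be skipped at read time so that every sheet arrives with the correct header (a sketch, assuming the disclaimer occupies only the first row of each sheet):

df = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx', sheet_name=[1, 2, 3, 4], header=1)
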
Change the header¶

In [4]:
new_header_1 = df[1].iloc[0] 
transactions = df[1][1:]
transactions.columns = new_header_1
transactions = transactions.rename_axis(None, axis = 1)
transactions.head()
Out[4]:
transaction_id product_id customer_id transaction_date online_order order_status brand product_line product_class product_size list_price standard_cost product_first_sold_date
1 1 2 2950 2017-02-25 00:00:00 False Approved Solex Standard medium medium 71.49 53.62 41245
2 2 3 3120 2017-05-21 00:00:00 True Approved Trek Bicycles Standard medium large 2091.47 388.92 41701
3 3 37 402 2017-10-16 00:00:00 False Approved OHM Cycles Standard low medium 1793.43 248.82 36361
4 4 88 3135 2017-08-31 00:00:00 False Approved Norco Bicycles Standard medium medium 1198.46 381.1 36145
5 5 78 787 2017-10-01 00:00:00 True Approved Giant Bicycles Standard medium large 1765.3 709.48 42226

Data types¶

In [5]:
transactions.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 1 to 20000
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   transaction_id           20000 non-null  object
 1   product_id               20000 non-null  object
 2   customer_id              20000 non-null  object
 3   transaction_date         20000 non-null  object
 4   online_order             19640 non-null  object
 5   order_status             20000 non-null  object
 6   brand                    19803 non-null  object
 7   product_line             19803 non-null  object
 8   product_class            19803 non-null  object
 9   product_size             19803 non-null  object
 10  list_price               20000 non-null  object
 11  standard_cost            19803 non-null  object
 12  product_first_sold_date  19803 non-null  object
dtypes: object(13)
memory usage: 2.0+ MB
In [6]:
#change the currency variables to float
transactions['list_price'] = pd.to_numeric(transactions['list_price'], errors='coerce')
transactions['standard_cost'] = pd.to_numeric(transactions['standard_cost'], errors='coerce')
#change the date variable to datetime
transactions['transaction_date'] =  pd.to_datetime(transactions['transaction_date'], format='%Y-%m-%d')
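
Because errors='coerce' silently turns unparseable entries into NaN, it is worth confirming that the conversion created no missing values beyond those already present (a quick sketch; the counts should match the completeness table below):

transactions[['list_price', 'standard_cost']].isna().sum()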

Completeness¶

In [7]:
transactions.isna().sum()
Out[7]:
transaction_id               0
product_id                   0
customer_id                  0
transaction_date             0
online_order               360
order_status                 0
brand                      197
product_line               197
product_class              197
product_size               197
list_price                   0
standard_cost              197
product_first_sold_date    197
dtype: int64
Comment:¶

We found 360 missing values in the “online_order” column and 197 in each of six other columns (“brand”, “product_line”, “product_class”, “product_size”, “standard_cost”, “product_first_sold_date”).

  • As the 197 rows with missing values lack almost half of their fields, we will remove those rows to improve the completeness of the table.
  • In the “online_order” column, the proportions of True and False values are approximately equal, so we will forward-fill the missing values with the last valid observation (the split is re-checked after the imputation below).
In [8]:
#remove the 197 rows with missing product information
transactions.dropna(subset=['brand'], inplace=True)
transactions.reset_index(drop=True, inplace=True)
In [9]:
transactions['online_order'].value_counts()
Out[9]:
True     9739
False    9706
Name: online_order, dtype: int64
In [10]:
#forward-fill missing values with the last valid observation
transactions['online_order'] = transactions['online_order'].fillna(method='ffill')
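
To confirm that the imputation preserved the roughly even True/False split, the proportions can be re-checked (a quick sketch):

transactions['online_order'].value_counts(normalize=True)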

Relevancy¶

Comment:¶

We also see that the values in the “product_first_sold_date” column are stored as 5-digit Excel date serial numbers rather than as dates. Therefore, we will convert them into datetime format.

In [11]:
#convert the Excel serial numbers in "product_first_sold_date" into dates
#Excel's 1900 date system counts days from an origin of 1899-12-30
lst = []
for x in np.array(transactions['product_first_sold_date']):
    lst.append(dt.datetime(1899, 12, 30) + dt.timedelta(days=x))
transactions['product_first_sold_date'] = np.array(lst)
In [12]:
#ensure the column dtype is datetime64
transactions['product_first_sold_date'] =  pd.to_datetime(transactions['product_first_sold_date'], format='%Y-%m-%d')
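
As a spot-check of the origin arithmetic, the first serial number in the table (41245) should map to a sensible date:

dt.datetime(1899, 12, 30) + dt.timedelta(days=41245)
#expected: datetime.datetime(2012, 12, 2, 0, 0)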

2. NewCustomerList table¶

First Look At The Data¶

In [13]:
df[2]
Out[13]:
Note: The data and information in this document is reflective of a hypothetical situation and client. This document is to be used for KPMG Virtual Internship purposes only. Unnamed: 1 Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 ... Unnamed: 13 Unnamed: 14 Unnamed: 15 Unnamed: 16 Unnamed: 17 Unnamed: 18 Unnamed: 19 Unnamed: 20 Unnamed: 21 Unnamed: 22
0 first_name last_name gender past_3_years_bike_related_purchases DOB job_title job_industry_category wealth_segment deceased_indicator owns_car ... state country property_valuation NaN NaN NaN NaN NaN Rank Value
1 Chickie Brister Male 86 1957-07-12 General Manager Manufacturing Mass Customer N Yes ... QLD Australia 6 0.56 0.7000 0.875000 0.743750 1.0 1 1.71875
2 Morly Genery Male 69 1970-03-22 Structural Engineer Property Mass Customer N No ... NSW Australia 11 0.89 0.8900 1.112500 0.945625 1.0 1 1.71875
3 Ardelis Forrester Female 10 1974-08-28 00:00:00 Senior Cost Accountant Financial Services Affluent Customer N No ... VIC Australia 5 1.01 1.0100 1.010000 1.010000 1.0 1 1.71875
4 Lucine Stutt Female 64 1979-01-28 Account Representative III Manufacturing Affluent Customer N Yes ... QLD Australia 1 0.87 1.0875 1.087500 1.087500 4.0 4 1.703125
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
996 Ferdinand Romanetti Male 60 1959-10-07 Paralegal Financial Services Affluent Customer N No ... NSW Australia 7 0.79 0.7900 0.790000 0.790000 996.0 996 0.374
997 Burk Wortley Male 22 2001-10-17 Senior Sales Associate Health Mass Customer N No ... NSW Australia 10 0.76 0.7600 0.950000 0.807500 997.0 997 0.357
998 Melloney Temby Female 17 1954-10-05 Budget/Accounting Analyst IV Financial Services Affluent Customer N Yes ... QLD Australia 2 0.85 1.0625 1.062500 1.062500 997.0 997 0.357
999 Dickie Cubbini Male 30 1952-12-17 Financial Advisor Financial Services Mass Customer N Yes ... QLD Australia 2 1.09 1.3625 1.362500 1.158125 997.0 997 0.357
1000 Sylas Duffill Male 56 1955-10-02 Staff Accountant IV Property Mass Customer N Yes ... NSW Australia 9 0.47 0.5875 0.734375 0.624219 1000.0 1000 0.34

1001 rows × 23 columns

Change the header¶

In [14]:
new_header_2 = df[2].iloc[0] 
newcust = df[2][1:]
newcust.columns = new_header_2
newcust = newcust.rename_axis(None, axis = 1)
newcust.head()
Out[14]:
first_name last_name gender past_3_years_bike_related_purchases DOB job_title job_industry_category wealth_segment deceased_indicator owns_car ... state country property_valuation NaN NaN NaN NaN NaN Rank Value
1 Chickie Brister Male 86 1957-07-12 General Manager Manufacturing Mass Customer N Yes ... QLD Australia 6 0.56 0.7000 0.8750 0.743750 1.0 1 1.71875
2 Morly Genery Male 69 1970-03-22 Structural Engineer Property Mass Customer N No ... NSW Australia 11 0.89 0.8900 1.1125 0.945625 1.0 1 1.71875
3 Ardelis Forrester Female 10 1974-08-28 00:00:00 Senior Cost Accountant Financial Services Affluent Customer N No ... VIC Australia 5 1.01 1.0100 1.0100 1.010000 1.0 1 1.71875
4 Lucine Stutt Female 64 1979-01-28 Account Representative III Manufacturing Affluent Customer N Yes ... QLD Australia 1 0.87 1.0875 1.0875 1.087500 4.0 4 1.703125
5 Melinda Hadlee Female 34 1965-09-21 Financial Analyst Financial Services Affluent Customer N No ... NSW Australia 9 0.52 0.5200 0.6500 0.650000 4.0 4 1.703125

5 rows × 23 columns

Data types¶

In [15]:
newcust.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 1 to 1000
Data columns (total 23 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   first_name                           1000 non-null   object 
 1   last_name                            971 non-null    object 
 2   gender                               1000 non-null   object 
 3   past_3_years_bike_related_purchases  1000 non-null   object 
 4   DOB                                  983 non-null    object 
 5   job_title                            894 non-null    object 
 6   job_industry_category                835 non-null    object 
 7   wealth_segment                       1000 non-null   object 
 8   deceased_indicator                   1000 non-null   object 
 9   owns_car                             1000 non-null   object 
 10  tenure                               1000 non-null   object 
 11  address                              1000 non-null   object 
 12  postcode                             1000 non-null   object 
 13  state                                1000 non-null   object 
 14  country                              1000 non-null   object 
 15  property_valuation                   1000 non-null   object 
 16  nan                                  1000 non-null   float64
 17  nan                                  1000 non-null   float64
 18  nan                                  1000 non-null   float64
 19  nan                                  1000 non-null   float64
 20  nan                                  1000 non-null   float64
 21  Rank                                 1000 non-null   object 
 22  Value                                1000 non-null   object 
dtypes: float64(5), object(18)
memory usage: 179.8+ KB
In [16]:
#drop unnecessary columns
newcust.drop(newcust.columns[[16, 17, 18, 19, 20]], inplace = True, axis = 1)
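
Because those five columns carry NaN headers after the promotion, an equivalent label-based drop (a sketch) avoids hard-coding the positions:

newcust = newcust.loc[:, newcust.columns.notna()]
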
In [17]:
#change the numerical variables to float
newcust['tenure'] = pd.to_numeric(newcust['tenure'], errors='coerce')
newcust['Rank'] = pd.to_numeric(newcust['Rank'], errors='coerce')
newcust['Value'] = pd.to_numeric(newcust['Value'], errors='coerce')
#change the date variable to datetime
newcust['DOB'] =  pd.to_datetime(newcust['DOB'], format='%Y-%m-%d')

Consistency¶

In [18]:
newcust['gender'].value_counts()
Out[18]:
Female    513
Male      470
U          17
Name: gender, dtype: int64
Comment:¶

In the “gender” column, there are 17 values labelled “U”, which do not correspond to any gender category. We will discard these rows later (see the “DOB” step below).

Completeness¶

In [19]:
newcust.isna().sum()
Out[19]:
first_name                               0
last_name                               29
gender                                   0
past_3_years_bike_related_purchases      0
DOB                                     17
job_title                              106
job_industry_category                  165
wealth_segment                           0
deceased_indicator                       0
owns_car                                 0
tenure                                   0
address                                  0
postcode                                 0
state                                    0
country                                  0
property_valuation                       0
Rank                                     0
Value                                    0
dtype: int64
Comment:¶

There are 29 missing values in the “last_name” column, 17 in “DOB”, 106 in “job_title” and 165 in “job_industry_category”.

  • To mitigate the missing values in the “last_name” variable, we will create a new column called “name” that combines first and last name. This new column replaces the “first_name” and “last_name” variables, so we will remove those two old columns (and check the result just below).
In [20]:
#create a name variable that contains both first name and last name
#strip() removes the trailing space left when last_name is missing
newcust['name'] = (newcust['first_name'] + ' ' + newcust['last_name'].fillna('')).str.strip()
newcust.drop(['first_name', 'last_name'], inplace = True, axis = 1)
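
Since “first_name” has no missing values, the new column should be fully populated; a quick check (sketch):

newcust['name'].isna().sum()
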
  • We also found that the 17 missing values in “DOB” fall exactly on the rows where gender is “U”. Removing the rows with a missing “DOB” therefore resolves both the completeness and the consistency issue; the overlap is verified below.
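
The overlap can be verified before dropping anything (a quick sketch; all 17 rows with a missing “DOB” should report gender “U”):

newcust.loc[newcust['DOB'].isna(), 'gender'].value_counts()
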
In [21]:
#Remove missing values in DOB 
newcust.dropna(subset=['DOB'], inplace=True)
newcust.reset_index(drop=True, inplace=True)
  • We deal with the blanks in “job_title” and “job_industry_category” by forward-filling them with the last valid observation; completeness is re-checked after both fills below.
In [22]:
newcust['job_title'].value_counts()
Out[22]:
Environmental Tech          14
Software Consultant         14
Associate Professor         14
Chief Design Engineer       13
Cost Accountant             12
                            ..
Web Developer II             1
Statistician III             1
Systems Administrator IV     1
Accounting Assistant IV      1
Web Developer I              1
Name: job_title, Length: 184, dtype: int64
In [23]:
#forward-fill missing values with the last valid observation
newcust['job_title'] = newcust['job_title'].fillna(method='ffill')
In [24]:
newcust['job_industry_category'].value_counts()
Out[24]:
Financial Services    202
Manufacturing         199
Health                152
Retail                 78
Property               64
Entertainment          36
IT                     36
Argiculture            26
Telecommunications     25
Name: job_industry_category, dtype: int64
In [25]:
#forward-fill missing values with the last valid observation
newcust['job_industry_category'] = newcust['job_industry_category'].fillna(method='ffill')
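
With both columns filled, a final completeness check (a sketch) should now report no missing values left anywhere in the table:

newcust.isna().sum().sum()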

3. CustomerDemographic table¶

First Look At The Data¶

In [26]:
df[3]
Out[26]:
Note: The data and information in this document is reflective of a hypothetical situation and client. This document is to be used for KPMG Virtual Internship purposes only. Unnamed: 1 Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 Unnamed: 10 Unnamed: 11 Unnamed: 12
0 customer_id first_name last_name gender past_3_years_bike_related_purchases DOB job_title job_industry_category wealth_segment deceased_indicator default owns_car tenure
1 1 Laraine Medendorp F 93 1953-10-12 00:00:00 Executive Secretary Health Mass Customer N "' Yes 11
2 2 Eli Bockman Male 81 1980-12-16 00:00:00 Administrative Officer Financial Services Mass Customer N <script>alert('hi')</script> Yes 16
3 3 Arlin Dearle Male 61 1954-01-20 00:00:00 Recruiting Manager Property Mass Customer N 2018-02-01 00:00:00 Yes 15
4 4 Talbot NaN Male 33 1961-10-03 00:00:00 NaN IT Mass Customer N () { _; } >_[$($())] { touch /tmp/blns.shellsh... No 7
... ... ... ... ... ... ... ... ... ... ... ... ... ...
3996 3996 Rosalia Halgarth Female 8 1975-08-09 00:00:00 VP Product Management Health Mass Customer N -100 No 19
3997 3997 Blanch Nisuis Female 87 2001-07-13 00:00:00 Statistician II Manufacturing High Net Worth N â¦testâ§ Yes 1
3998 3998 Sarene Woolley U 60 NaN Assistant Manager IT High Net Worth N NaN No NaN
3999 3999 Patrizius NaN Male 11 1973-10-24 00:00:00 NaN Manufacturing Affluent Customer N ¡â¢Â£Â¢â§¶â¢ÂªÂºââ Yes 10
4000 4000 Kippy Oldland Male 76 1991-11-05 00:00:00 Software Engineer IV NaN Affluent Customer N 0/0 No 11

4001 rows × 13 columns

Change the header¶

In [27]:
new_header_3 = df[3].iloc[0] 
custdemo = df[3][1:]
custdemo.columns = new_header_3
custdemo = custdemo.rename_axis(None, axis = 1)
custdemo.head()
Out[27]:
customer_id first_name last_name gender past_3_years_bike_related_purchases DOB job_title job_industry_category wealth_segment deceased_indicator default owns_car tenure
1 1 Laraine Medendorp F 93 1953-10-12 00:00:00 Executive Secretary Health Mass Customer N "' Yes 11
2 2 Eli Bockman Male 81 1980-12-16 00:00:00 Administrative Officer Financial Services Mass Customer N <script>alert('hi')</script> Yes 16
3 3 Arlin Dearle Male 61 1954-01-20 00:00:00 Recruiting Manager Property Mass Customer N 2018-02-01 00:00:00 Yes 15
4 4 Talbot NaN Male 33 1961-10-03 00:00:00 NaN IT Mass Customer N () { _; } >_[$($())] { touch /tmp/blns.shellsh... No 7
5 5 Sheila-kathryn Calton Female 56 1977-05-13 00:00:00 Senior Editor NaN Affluent Customer N NIL Yes 8

Data types¶

In [28]:
custdemo.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 1 to 4000
Data columns (total 13 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   customer_id                          4000 non-null   object
 1   first_name                           4000 non-null   object
 2   last_name                            3875 non-null   object
 3   gender                               4000 non-null   object
 4   past_3_years_bike_related_purchases  4000 non-null   object
 5   DOB                                  3913 non-null   object
 6   job_title                            3494 non-null   object
 7   job_industry_category                3344 non-null   object
 8   wealth_segment                       4000 non-null   object
 9   deceased_indicator                   4000 non-null   object
 10  default                              3698 non-null   object
 11  owns_car                             4000 non-null   object
 12  tenure                               3913 non-null   object
dtypes: object(13)
memory usage: 406.4+ KB
In [29]:
#change the numerical variable to float
custdemo['tenure'] = pd.to_numeric(custdemo['tenure'], errors='coerce')
#change the date variable to datetime
custdemo['DOB'] =  pd.to_datetime(custdemo['DOB'], format='%Y-%m-%d')

Completeness¶

In [30]:
custdemo.isna().sum()
Out[30]:
customer_id                              0
first_name                               0
last_name                              125
gender                                   0
past_3_years_bike_related_purchases      0
DOB                                     87
job_title                              506
job_industry_category                  656
wealth_segment                           0
deceased_indicator                       0
default                                302
owns_car                                 0
tenure                                  87
dtype: int64
Comment:¶

We found 125 missing values in the “last_name” column, 87 in both “DOB” and “tenure”, 302 in “default”, and 506 and 656 in “job_title” and “job_industry_category” respectively.

  • We handle the missing values in “last_name” the same way as in the NewCustomerList table above.
  • The 87 rows with blanks in “DOB” are the same rows that have null “tenure” values, so we will remove those 87 rows (see the check after this list).
  • We deal with the blanks in “job_title” and “job_industry_category” by forward-filling them with the last valid observation.
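
The claimed overlap can be verified before dropping (a quick sketch; the count should come back as 87):

(custdemo['DOB'].isna() & custdemo['tenure'].isna()).sum()
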
In [31]:
#create a name variable that contains both first name and last name
#strip() removes the trailing space left when last_name is missing
custdemo['name'] = (custdemo['first_name'] + ' ' + custdemo['last_name'].fillna('')).str.strip()
custdemo.drop(['first_name', 'last_name'], inplace = True, axis = 1)
In [32]:
#Remove missing values in DOB 
custdemo.dropna(subset=['DOB'], inplace=True)
custdemo.reset_index(drop=True, inplace=True)
In [33]:
#forward-fill missing values with the last valid observation
custdemo['job_title'] = custdemo['job_title'].fillna(method='ffill')
In [34]:
#forward-fill missing values with the last valid observation
custdemo['job_industry_category'] = custdemo['job_industry_category'].fillna(method='ffill')

Accuracy¶

In [35]:
custdemo['DOB'].sort_values().head()
Out[35]:
33     1843-12-21
704    1931-10-23
1065   1935-08-22
3334   1940-09-22
2361   1943-08-11
Name: DOB, dtype: datetime64[ns]
Comment:¶

In the “DOB” column, we noticed that one customer was born on 1843-12-21, which would make them about 175 years old given the dataset’s 2017–2018 time frame. Knowing that this cannot be correct, we will delete this row.

In [36]:
#remove the outlier (the 1843 date of birth) from the DOB column
custdemo.drop(custdemo[custdemo['DOB'] == '1843-12-21'].index, inplace = True)
custdemo = custdemo.reset_index(drop= True)

Consistency¶

In [37]:
custdemo['gender'].value_counts()
Out[37]:
Female    2037
Male      1872
F            1
Femal        1
M            1
Name: gender, dtype: int64
Comment:¶

We found some inconsistencies, along with misspellings, in the “gender” column: “F”, “M” and “Femal” should read “Female”, “Male” and “Female” respectively.

In [38]:
#rename value in gender column
custdemo['gender'].replace('F', 'Female',inplace=True)
custdemo['gender'].replace('M', 'Male',inplace=True)
custdemo['gender'].replace('Femal', 'Female',inplace=True)
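
The three replacements can equally be expressed as one mapping, which scales better if more variants turn up (a sketch of the same clean-up):

custdemo['gender'] = custdemo['gender'].replace({'F': 'Female', 'M': 'Male', 'Femal': 'Female'})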

Validity¶

Comment:¶

As we only want to analyse data from customers who are still alive, we will remove the rows with the value “Y” in “deceased_indicator”.

In [39]:
#remove "Y" values from the deceased_indicator column
custdemo.drop(custdemo[custdemo['deceased_indicator'] == 'Y'].index, inplace = True)
custdemo = custdemo.reset_index(drop= True)

Relevancy¶

Comment:¶

As the “default” column contains arbitrary junk strings (escape sequences, scripts, stray tokens) with no apparent relationship to the rest of the dataset, we will delete this column from the table.

In [40]:
#drop default variable
custdemo.drop(['default'], inplace = True, axis = 1)

4. CustomerAddress table¶

First Look At The Data¶

In [41]:
df[4]
Out[41]:
Note: The data and information in this document is reflective of a hypothetical situation and client. This document is to be used for KPMG Virtual Internship purposes only. Unnamed: 1 Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5
0 customer_id address postcode state country property_valuation
1 1 060 Morning Avenue 2016 New South Wales Australia 10
2 2 6 Meadow Vale Court 2153 New South Wales Australia 10
3 4 0 Holy Cross Court 4211 QLD Australia 9
4 5 17979 Del Mar Point 2448 New South Wales Australia 4
... ... ... ... ... ... ...
3995 3999 1482 Hauk Trail 3064 VIC Australia 3
3996 4000 57042 Village Green Point 4511 QLD Australia 6
3997 4001 87 Crescent Oaks Alley 2756 NSW Australia 10
3998 4002 8194 Lien Street 4032 QLD Australia 7
3999 4003 320 Acker Drive 2251 NSW Australia 7

4000 rows × 6 columns

Change the header¶

In [42]:
new_header_4 = df[4].iloc[0] 
address = df[4][1:]
address.columns = new_header_4
address = address.rename_axis(None, axis = 1)
address.head()
Out[42]:
customer_id address postcode state country property_valuation
1 1 060 Morning Avenue 2016 New South Wales Australia 10
2 2 6 Meadow Vale Court 2153 New South Wales Australia 10
3 4 0 Holy Cross Court 4211 QLD Australia 9
4 5 17979 Del Mar Point 2448 New South Wales Australia 4
5 6 9 Oakridge Court 3216 VIC Australia 9

Data types¶

In [43]:
address.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3999 entries, 1 to 3999
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_id         3999 non-null   object
 1   address             3999 non-null   object
 2   postcode            3999 non-null   object
 3   state               3999 non-null   object
 4   country             3999 non-null   object
 5   property_valuation  3999 non-null   object
dtypes: object(6)
memory usage: 187.6+ KB

Completeness¶

In [45]:
address.isna().sum()
Out[45]:
customer_id           0
address               0
postcode              0
state                 0
country               0
property_valuation    0
dtype: int64
Comment:¶

There are no missing values in this dataframe.

Consistency¶

In [46]:
address['state'].value_counts()
Out[46]:
NSW                2054
VIC                 939
QLD                 838
New South Wales      86
Victoria             82
Name: state, dtype: int64
Comment:¶

There are some inconsistencies in the “state” column: full state names appear alongside abbreviations. Therefore, we will change “New South Wales” to “NSW” and “Victoria” to “VIC”.

In [47]:
#rename value in state column
address['state'].replace('New South Wales', 'NSW',inplace=True)
address['state'].replace('Victoria', 'VIC',inplace=True)

Additional values¶

Comment:¶

There are some customer_ids that appear in the Transactions and CustomerAddress tables but not in the CustomerDemographic table. Therefore, we will remove those extra rows from both tables; the offending IDs are identified below.

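The orphan IDs can be found with a set difference against the raw CustomerDemographic ID range (a sketch; it assumes, as the sheet suggests, that demographic customer_ids run from 1 to 4000):

set(transactions['customer_id']) - set(range(1, 4001))
#expected: {5034}
set(address['customer_id']) - set(range(1, 4001))
#expected: {4001, 4002, 4003}
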
In [48]:
transactions.drop(transactions[transactions['customer_id'] == 5034].index, inplace = True)
transactions = transactions.reset_index(drop= True)
address.drop(address[(address['customer_id'] == 4001) | (address['customer_id'] == 4002) | (address['customer_id'] == 4003)].index, inplace = True)
address = address.reset_index(drop= True)

Save file¶

In [49]:
#write the four cleaned tables to one Excel workbook; the context manager saves and closes the file on exit
with pd.ExcelWriter('KPMG_task1.xlsx') as writer:
    transactions.to_excel(writer, index=False, sheet_name='Transactions')
    newcust.to_excel(writer, index=False, sheet_name='NewCustomerList')
    custdemo.to_excel(writer, index=False, sheet_name='CustomerDemographic')
    address.to_excel(writer, index=False, sheet_name='CustomerAddress')