import pandas as pd
import numpy as np
import os
from pandas import DataFrame
# Show floats in plain fixed-point notation rather than scientific notation.
pd.set_option('display.float_format', lambda x: '%f' % x)

# Load the Gapminder data set straight from its published URL.
data = pd.read_csv("http://makemeanalyst.com/python-data-science/gapminder.csv", low_memory=False)
print(len(data))  # number of rows in the data set
# head() only returns a frame; it has to be printed to be visible in a script.
print(data.head())
print(data.dtypes)

# Treat the first column as a categorical identifier and every remaining
# column as numeric. Values that cannot be parsed become NaN (errors='coerce').
colnames = data.columns.tolist()
colnames2 = colnames[1:]
for col in colnames2:
    data[col] = pd.to_numeric(data[col], errors='coerce')
data[colnames[0]] = data[colnames[0]].astype('category')
print(data.dtypes)  # dtypes after the conversion

# The three indicators analysed in the rest of the script.
key_columns = ['incomeperperson', 'urbanrate', 'employrate']

# Missing-value counts: per key column first, then for every column.
# (Original author's notes: 23, 10 and 35 NaNs respectively.)
for col in key_columns:
    print(data[col].isnull().sum())
print(data.isnull().sum())

# Drop only the rows in which all three key indicators are missing.
data = data.dropna(subset=key_columns, how='all')
print(len(data))

# Column means, rounded to two decimals, are used as the imputation values.
mean = data[key_columns].mean().round(2)
print(mean)

# Fill the remaining gaps column by column. fillna aligns the Series of means
# on column labels, avoiding positional lookups such as mean[0] on a
# label-indexed Series.
data[key_columns] = data[key_columns].fillna(mean)

# Keep just the three indicators. The explicit copy makes `data` an
# independent frame so later column assignments do not hit a slice of the
# original (SettingWithCopyWarning).
data = data[key_columns].copy()
print(data.head())
print(data.shape)
# Bin per-capita income into three labelled bands using pd.cut's default
# right-closed intervals: (0, 1000], (1000, 12735] and (12735, max].
# The top edge is the scalar column maximum; the removed `.ix` indexer
# (gone since pandas 1.0) previously supplied a one-element Series here.
income_ceiling = data['incomeperperson'].max()
data['factor_income'] = pd.cut(
    data['incomeperperson'],
    [0, 1000, 12735, income_ceiling],
    labels=['Lower Income', 'Middle Income', 'Upper Income'],
)

print('counts for each incomeperperson')
# Absolute frequency per band, in band order, with NaN reported as its own row.
c1 = data['factor_income'].value_counts(sort=False, dropna=False)
print(c1)
# Relative frequency (fraction of rows) per band.
c5 = data['factor_income'].value_counts(sort=False, normalize=True)
print(c5)
# Split urbanrate into four equal-frequency groups (quartiles), labelled
# "1" for the lowest quarter through "4" for the highest.
quartile_labels = ["1", "2", "3", "4"]
data['factor_urbanrate'] = pd.qcut(data['urbanrate'], q=4, labels=quartile_labels)

print('counts for each urban rate')
urban_quartiles = data['factor_urbanrate']
# c2: how many rows fall in each quartile, with NaN reported as its own row.
c2 = urban_quartiles.value_counts(dropna=False, sort=False)
# c3: the share of rows in each quartile.
c3 = urban_quartiles.value_counts(normalize=True, sort=False)
print(c2)
print("Percentage for each catagory\n", c3)
# Bin the employment rate into Low (0, 50], Average (50, 70] and
# High (70, max] using pd.cut's default right-closed intervals.
# The top edge is the scalar column maximum; the removed `.ix` indexer
# (gone since pandas 1.0) previously supplied a one-element Series here.
employ_ceiling = data['employrate'].max()
data['EmpRateCatogory'] = pd.cut(
    data['employrate'],
    [0, 50, 70, employ_ceiling],
    labels=['Low', 'Average', 'High'],
)

# Absolute frequency per band, in band order, with NaN reported as its own row.
c4 = data['EmpRateCatogory'].value_counts(sort=False, dropna=False)
print(c4)
# Relative frequency (fraction of rows) per band.
c6 = data['EmpRateCatogory'].value_counts(sort=False, normalize=True)
print("Percentage for each catagory\n", c6)