"""
Created on Sun Sep 3 01:17:41 2017
@author: Kalyan
"""
import pandas as pd
import numpy as np
from pandas import DataFrame
import os
# Download the Gapminder CSV. low_memory=False asks pandas to parse the file
# in a single pass instead of in chunks, which avoids mixed-type inference.
data = pd.read_csv(
    "http://makemeanalyst.com/python-data-science/gapminder.csv",
    low_memory=False,
)
data.head()  # preview of the first rows; only shown in an interactive session
# Dataset dimensions: one row per observation, one column per feature.
print("No of Records : ", data.shape[0])
print("No of Features : ", data.shape[1])
# Column dtypes as inferred while parsing the CSV.
print(data.dtypes)
# Coerce the three analysis columns to a numeric dtype.
# Series.convert_objects(convert_numeric=True) no longer exists in current
# pandas; pd.to_numeric(errors='coerce') is its replacement and likewise turns
# entries that cannot be parsed as numbers into NaN.
for column in ('incomeperperson', 'urbanrate', 'employrate'):
    data[column] = pd.to_numeric(data[column], errors='coerce')
# Print the dtypes again so the effect of the conversion is visible.
print(data.dtypes)
# Select the rows in which income, urban rate AND employment rate are all
# missing (the three NaN masks are combined with a logical AND).
sub1 = data.loc[
    np.isnan(data['incomeperperson'])
    & np.isnan(data['urbanrate'])
    & np.isnan(data['employrate'])
]
sub1  # only displayed when run in an interactive session
# Bin income per person into three labelled bands using the fixed edges
# 0, 1000, 12735 and the column maximum. pd.cut uses right-closed intervals by
# default, so the bands are (0, 1000], (1000, 12735] and (12735, max].
# The upper edge is taken as a scalar from the column itself: the former
# data.ix[:, [...]].max() relied on the removed .ix indexer and produced a
# one-element Series rather than a number.
data['factor_income'] = pd.cut(
    data['incomeperperson'],
    [0, 1000, 12735, data['incomeperperson'].max()],
    labels=['Lower Income', 'Middle Income', 'Upper Income'],
)
data.head()  # only displayed when run in an interactive session
print('counts for original incomeperperson')
# dropna=False keeps a row for missing values; sort=False keeps category order.
c1 = data['factor_income'].value_counts(sort=False, dropna=False)
print(c1)  # frequency of each category, including the number of NA values
print(data['factor_income'].describe())
# Split urban rate into four quantile-based groups (quartiles) with pd.qcut.
data['factor_urbanrate'] = pd.qcut(
    data['urbanrate'],
    4,
    labels=["1=0%tile", "2=25%tile", "3=50%tile", "4=75%tile"],
)
# The header previously said 'incomeperperson' (copy-paste slip); the counts
# printed below are for the urban-rate quartiles.
print('counts for urbanrate quartiles')
# Absolute frequency of each quartile; dropna=False also counts NA values.
c2 = data['factor_urbanrate'].value_counts(sort=False, dropna=False)
# normalize=True returns the fraction of rows that falls in each category.
c3 = data['factor_urbanrate'].value_counts(
    sort=False, dropna=False, normalize=True
)
print(c2)  # frequency of each category, including the number of NA values
print("Percentage for each catagory\n", c3 * 100)
# Bin employment rate into Low (0, 50], Average (50, 70] and High (70, max];
# pd.cut uses right-closed intervals by default.
# The upper edge is the column maximum taken as a scalar: the former
# data.ix[:, [...]].max() relied on the removed .ix indexer and produced a
# one-element Series rather than a number.
data['EmpRateCatogory'] = pd.cut(
    data['employrate'],
    [0, 50, 70, data['employrate'].max()],
    labels=['Low', 'Average', 'High'],
)
# Frequency of each category; dropna=False also counts missing values.
c4 = data['EmpRateCatogory'].value_counts(sort=False, dropna=False)
print(c4)