Pandas is mostly used to read data, fix missing values, mask (filter) rows, and convert the result to a NumPy array.
It is also used to quickly visualize data from files.
import pandas as pd
import numpy as np
# Show which pandas version is installed.
pd.__version__
Returns a pandas.DataFrame object
# Read a CSV file.
pd.read_csv('files/data_csv.csv')

# JSON laid out column-wise:
#   {column1: [values_list], column2: [values_list]}
pd.read_json('files/data_json.json')

# JSON laid out as a list of row records:
#   [{col1: val, col2: val}, {col1: val, col2: val}]
pd.read_json('files/data_json_records.json', orient='records')

# Keep the CSV contents in `df` for the examples that follow.
df = pd.read_csv('files/data_csv.csv')
type(df)

# Select one column by label, then several columns at once.
df.loc[:, "Salary"]
df.loc[:, ["Country", "Salary"]]

# Summary statistics of the DataFrame.
df.describe()
Boolean masking works just like in NumPy.
# Boolean masking, NumPy style: keep only the rows whose Salary exceeds 60000.
df.loc[df['Salary'] > 60000]

# The underlying data as a NumPy array.
df.values
import numpy as np

# Synthetic data: 100000 evenly spaced points on [0, 10] and a noisy copy of them.
# Both are kept as (n, 1) column vectors so they can be joined side by side
# into a two-column array.
x = np.linspace(0, 10, 100000)[:, np.newaxis]
y = (np.linspace(0, 10, 100000) + np.random.normal(0, 0.01, 100000))[:, np.newaxis]

# Stack the two column vectors horizontally and label the columns.
df = pd.DataFrame(np.hstack((x, y)), columns=["X", "Y"])

df.head()      # first five rows
df.sample(10)  # ten rows picked at random
#More on data visualization refresher. It will be difficult to plot million points. so we plot a sample
%matplotlib inline
df.sample(n=25).plot(x='X',y='Y',kind='scatter')
# Load the Pima Indians diabetes dataset from CSV into a DataFrame.
diabetes = pd.read_csv("files/pima-indians-diabetes.csv")
# Preview the first rows to check what was loaded.
diabetes.head()
Let's normalize just the 'Number_pregnant' and 'Age' columns.
# List the available column labels.
diabetes.columns

# Columns that will be rescaled.
cols_to_normalize = ['Number_pregnant', 'Age']
diabetes[cols_to_normalize].head()


def fn(x):
    """Min-max scale x: subtract its minimum, then divide by its range."""
    lowest = x.min()
    highest = x.max()
    return (x - lowest) / (highest - lowest)


# DataFrame.apply with axis=0 (the default) calls fn once per column;
# with axis=1 it would call fn once per row.
diabetes[cols_to_normalize] = diabetes[cols_to_normalize].apply(fn, axis=0)
diabetes.head()