Machine Learning Cheatsheet
Read and save tabular files
import pandas as pd
# Location of the input file on disk.
path = ""
# Load the table; use the reader that matches the file type.
df = pd.read_csv(path)  # CSV input
df = pd.read_excel(path)  # Excel input
...
# Destination where the exported table will be written.
path_out = ""
# Write the table as CSV, leaving the row index out of the file.
df.to_csv(path_out, index=False)
# Quick look at the table: size, column info, labels, and value summaries.
df.shape  # (number of rows, number of columns)
df.info()
df.columns
df["col1"].value_counts()  # how often each distinct value occurs in col1
df.describe()
df.describe().T  # same summary, transposed
# Delete columns in place. `columns=` already names the axis, so the
# redundant `axis=1` argument is left out.
df.drop(columns=["col1", "col2", "col3"], inplace=True)
# Keep only the listed columns, in the listed order.
columns = ["col1", "col2", "col3"]
df = df.loc[:, columns]
# Remove every row that contains at least one NaN value.
df = df.dropna(axis=0, how="any")
# Keep rows whose "cols1" equals a single value.
df = df.loc[df["cols1"] == "value1"]
# Keep rows whose "cols1" matches any value in a list.
df = df.loc[df["cols1"].isin(["value1", "value2", "value3"])]
# Stack the rows of several DataFrames into one, renumbering the index from 0.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
df_merged = pd.concat([df1, df2, df3], ignore_index=True)
Add a column with values from a list
# Create (or overwrite) the "col_new" column from a list of values.
df1["col_new"] = [value1, value2, value3]
# Parse text timestamps laid out as Year-Month-Day Hour:Minute:Second.
df["col_date"] = pd.to_datetime(
    df["col_date_before"],
    format="%Y-%m-%d %H:%M:%S",
)
# Difference in whole days (assumes date1 and date2 are datetime Series).
dif = (date1 - date2).dt.days
# Number of missing values in each column.
df.isna().sum()
# Share of missing values in each column, as a percentage of all rows.
df.isna().sum() / len(df) * 100.00
# Discard every row that has at least one missing value.
df = df.dropna()
Build a DataFrame from a dictionary
# Collect the source columns into a plain dict of lists, then build a new
# DataFrame from it. Columns are copied whole rather than row by row with
# iterrows(), which is slow and can change dtypes because each row is
# converted to a single Series.
temp_dict = {
    "cols1": df["cols_df1"].tolist(),
    "cols2": df["cols_df2"].tolist(),
}
# The new frame gets a fresh 0..n-1 index, independent of df's index.
df_new = pd.DataFrame(temp_dict)
Get all values of a text column as strings
# Values of the review_text column, cast to str.
df_train["review_text"].values.astype(str)