mufis-coder/machine-learning-cheatsheet

Cheatsheet Machine Learning

Pandas

read and save tabular files

import pandas as pd

# Location of the input file to load.
path = "" # string -> the path where the file is located
# Pick the reader that matches the file format; as written, the second call
# overwrites the df produced by the first.
df = pd.read_csv(path) # read a CSV file
df = pd.read_excel(path) # read an Excel file

...

# Location the exported CSV is written to.
path_out = "" # string -> the path where the output file will be saved
# index=False leaves the row index out of the written file.
df.to_csv(path_out, index=False)

basic info

# (number of rows, number of columns)
df.shape

# summary of columns, non-null counts and dtypes
df.info()

# column labels
df.columns

# frequency of each distinct value in one column
df["col1"].value_counts()

# summary statistics per column
df.describe()

# same statistics with rows and columns swapped
df.describe().transpose()

select features

# delete columns (columns= already targets the column axis, so axis=1 is not needed)
df.drop(columns=["col1", "col2", "col3"], inplace=True)

# select columns: keep only the listed columns, in this order
columns = ["col1", "col2", "col3"]
df = df[columns]

# delete rows that contain at least one NaN value
df = df.dropna()

select rows

# single condition: keep rows where cols1 equals one value
df = df[df["cols1"]=="value1"]

# multiple values: keep rows where cols1 matches any value in the list
df = df[df['cols1'].isin(["value1", "value2", "value3"])]

Concat df vertically

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames
# row-wise, and ignore_index=True renumbers the resulting index from 0.
df_merged = pd.concat([df1, df2, df3], ignore_index=True)

Add a column with values from a list

# assign a list as a new column; the list needs one value per row of df1
df1['col_new'] = [value1, value2, value3]

dates

# parse strings in the format Year-Month-Day Hour:Minute:Second into datetimes
df["col_date"] = pd.to_datetime(df['col_date_before'], format="%Y-%m-%d %H:%M:%S")

# date difference in whole days; date1 and date2 are presumably datetime
# Series (e.g. df["col_date"]), since .dt is used on their difference
dif = (date1-date2).dt.days

null values

# number of null values per column
df.isnull().sum()

# percentage of null values per column
df.isnull().sum() / df.shape[0] * 100.00

# drop every row that contains at least one null value
df = df.dropna()

make dataframe using dictionary

# Build the dictionary column-by-column instead of looping over rows with
# iterrows(): it yields the same two columns, but avoids the per-row Python
# loop and the dtype upcasting iterrows() applies when it packs each row
# into a Series.
temp_dict = {
    "cols1": df["cols_df1"].tolist(),
    "cols2": df["cols_df2"].tolist(),
}

# each dictionary key becomes a column of the new frame
df_new = pd.DataFrame(temp_dict)

get all values of a column as strings

# values of the review_text column as an array cast to str; this expression
# only returns the array, it does not print anything by itself
df_train.review_text.values.astype(str)