Cheatsheet Machine Learning

Pandas

  • read and save tabular files
import pandas as pd

# Load a table into a DataFrame. The two readers below are alternatives:
# keep only the one that matches the file type, because the second
# assignment overwrites the result of the first.
path = "" #string -> the path where the input file is located
df = pd.read_csv(path) # read csv file
df = pd.read_excel(path) # read excel file

...

# Save the DataFrame as CSV; index=False leaves the row index out of the file.
path_out = "" #string -> the path where the output file will be saved
df.to_csv(path_out, index=False)
  • basic info
# (number of rows, number of columns)
df.shape

# column names, non-null counts, dtypes and memory usage
df.info()

# column labels
df.columns

# frequency of each distinct value in one column
df["col1"].value_counts()

# summary statistics of the numeric columns
df.describe()

# same summary transposed: one row per column
df.describe().transpose()
  • select features
# delete columns (`columns=` already selects the axis, so `axis=1` is not needed)
df.drop(columns=["col1", "col2", "col3"], inplace=True)

# select columns
columns = ["col1", "col2", "col3"]
df = df[columns]

# delete rows that contain NaN values
df = df.dropna()
  • select rows
# single condition: keep rows where the column equals one value
df = df[df["cols1"]=="value1"]

# multiple accepted values: keep rows where the column matches any value in the list
df = df[df['cols1'].isin(["value1", "value2", "value3"])]
  • Concat df vertically
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the frames row-wise, and ignore_index=True renumbers the rows 0..n-1.
df_merged = pd.concat([df1, df2, df3], ignore_index=True)
  • Add a column with values from a list
# the list must supply exactly one value per row of df1
df1['col_new'] = [value1, value2, value3]
  • dates
# parse a text column into datetimes; format is Year-Month-Day Hour:Minute:Second
df["col_date"] = pd.to_datetime(df['col_date_before'], format="%Y-%m-%d %H:%M:%S")

# date difference in whole days; the .dt accessor implies date1 and date2 are
# datetime Series (e.g. columns parsed as above), not single timestamps
dif = (date1-date2).dt.days
  • null values
# number of null values per column
df.isnull().sum()

# percentage of null values per column (null count / row count * 100)
df.isnull().sum() / df.shape[0] * 100.00

# drop every row that has at least one null value
df = df.dropna()
  • make dataframe using dictionary
# Build the dictionary straight from the source columns instead of appending
# row by row: iterrows() is slow and converts each row to a Series, which can
# upcast values (e.g. int -> float) before they reach the new frame.
temp_dict = {
    "cols1": df["cols_df1"].tolist(),
    "cols2": df["cols_df2"].tolist(),
}

# each dictionary key becomes a column of the new DataFrame
df_new = pd.DataFrame(temp_dict)
  • show all values of a dataframe column as str
# NumPy array of the review_text column with every value cast to str.
# As a bare expression it is only displayed in an interactive session or
# notebook; wrap it in print(...) when running as a script.
df_train.review_text.values.astype(str)