class create_data_dictionary:
    """Helpers to quickly build and annotate a data dictionary for a DataFrame.

    Typical flow: ``make_my_data_dictionary`` profiles every column,
    ``define_data_meaning`` interactively collects a definition for each
    column and returns the dictionary with one row per column, and
    ``update_dd_definition`` revises a single definition afterwards.
    """

    def make_my_data_dictionary(self, dataFrame: "pd.DataFrame") -> "pd.DataFrame":
        """Create an initial data dictionary with empty definitions.

        Args:
            dataFrame: The data set to profile.

        Returns:
            A DataFrame with one column per column of ``dataFrame`` and the
            rows 'Type', 'Length', 'Null_Count', 'Size(Memory)' and
            'Definition' (the latter initialised to an empty string).
        """
        # Compute per-frame statistics once instead of once per column.
        row_count = len(dataFrame)
        null_counts = dataFrame.isna().sum()
        # index=False keeps the result aligned one-to-one with the columns
        # (the default would add an extra 'Index' entry).
        memory_sizes = dataFrame.memory_usage(index=False)

        entries = {}
        # Iterate positionally so the statistics always come from the frame
        # that was passed in, never from an unrelated outer-scope variable.
        for col, dtype, nulls, size in zip(
            dataFrame.columns, dataFrame.dtypes, null_counts, memory_sizes
        ):
            entries[col] = {
                'Type': str(dtype),
                'Length': row_count,
                'Null_Count': int(nulls),
                'Size(Memory)': int(size),
                'Definition': '',
            }
        return pd.DataFrame(entries)

    def define_data_meaning(self, df_data_dictionary: "pd.DataFrame") -> "pd.DataFrame":
        """Prompt for each column's meaning and return the transposed dictionary.

        The answers are written into the 'Definition' row of
        ``df_data_dictionary`` itself; the returned frame is its transpose
        (one row per described column).

        Args:
            df_data_dictionary: Frame laid out as produced by
                ``make_my_data_dictionary`` (described columns as columns).

        Returns:
            The transposed data dictionary including the entered definitions.
        """
        definition_row = 'Definition'
        for col in df_data_dictionary.columns:
            answer = input('Provide a data definition for {}'.format(col))
            # Single .loc assignment instead of chained indexing, which is
            # not guaranteed to write through to the frame.
            df_data_dictionary.loc[definition_row, col] = answer
        return df_data_dictionary.transpose()

    def update_dd_definition(self, df_data_dictionary: "pd.DataFrame", attribute) -> "pd.DataFrame":
        """Prompt for a new definition of a single attribute.

        Args:
            df_data_dictionary: Transposed data dictionary as returned by
                ``define_data_meaning`` (one row per attribute).
            attribute: Row label of the attribute whose definition to replace.

        Returns:
            A copy of the dictionary with the updated definition. If
            ``attribute`` (or the 'Definition' column) is not present, an
            error message is printed and the input is returned unchanged so
            that ``df_dd = dd.update_dd_definition(df_dd, ...)`` never
            discards the existing dictionary.
        """
        # Validate before prompting so the user is not asked for input that
        # would be thrown away.
        if (
            attribute not in df_data_dictionary.index
            or 'Definition' not in df_data_dictionary.columns
        ):
            print('Sorry, there was an error. Check attribute name and try again')
            return df_data_dictionary

        updated = df_data_dictionary.copy()
        updated.loc[attribute, 'Definition'] = input(
            'Provide a data definition for {}'.format(attribute)
        )
        return updated
Example use case:
df = pd.read_csv('Some data you have')
dd = create_data_dictionary()
df_dd = dd.make_my_data_dictionary(df)
df_dd = dd.define_data_meaning(df_dd)
print(df_dd)
The printed output is your data dictionary. You can go back and update the definition of any single attribute later by calling:
df_dd = dd.update_dd_definition(df_dd, 'Name of one of your data dictionary attributes')