Generating-Metadata-and-Preprocessing

Generate metadata from a .csv or a .json file and pre-process it (handling null values, normalising, ...) without external libraries.


This was a coursework project written in Python 2 (2.7 or later, since json.load's object_pairs_hook is used); no external libraries were used.

Generating metadata:

  1. Read from the config file the instructions (file to open, columns to pre-process...)
  2. Extract the data from the file, which can be a .csv or a .json
  3. Find the number of rows and fields
  4. Use a regex to determine each column's data type: compute the min/max for numeric columns and the unique values for string columns (see the sketch after this list)
  5. Generate a file with the meta-data information
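
For step 4, a minimal sketch of the numeric check, using a slightly tightened (fully anchored) version of the pattern in the ColumnIsNum function below:

import re

NUMERIC = re.compile(r'[-+]?(\d+(\.\d*)?|\.\d+)$')  # optional sign, digits, optional decimal part

print(bool(NUMERIC.match('3.14')))  # True  -> the value counts as numeric
print(bool(NUMERIC.match('-42')))   # True
print(bool(NUMERIC.match('12ab')))  # False -> the value counts as a string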

Pre-process data:

  1. Dropping null values
  2. Filling missing values
  3. Sorting (asc or desc)
  4. Normalising
  5. Validating debit card numbers using regex (see the sketch after this list)
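
For step 5, the check chains three patterns; here they are applied to a made-up number (the regexes are the ones from the validation() function below, with end anchors added):

import re

card = "4123-4567-8912-3456"  # hypothetical card number
starts_ok = re.match(r"^[456]", card)  # must start with 4, 5 or 6
groups_ok = re.match(r"(\d{4})(-?)(\d{4})(\2\d{4}){2}$", card)  # four groups of four digits, consistent separator
repeats_ok = re.match(r"((\d)(?!\2{3})){16}$", card.replace("-", ""))  # 16 digits, none repeated 4 times in a row
print(bool(starts_ok and groups_ok and repeats_ok))  # True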

Notes

Once the repository is downloaded, open CMD and type: python process.py config.json

  • The process.py file contains the whole program. Execution starts in the main() function, which calls the other functions in a fixed order.
  • The config.json file holds all the information and instructions for the operations to be performed on the input file (a reconstructed example follows).
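
The exact config file is not reproduced in this README, but its shape can be reconstructed from the keys the code reads; every file name and column name below is hypothetical:

{
    "inputfile": "records.csv",
    "metafile": "metadata.json",
    "outputfile": "preprocessed.csv",
    "preprocess": {
        "missing": {
            "dropnull": ["id"],
            "fillnull": {"age": "0", "member": "no"}
        },
        "normalise": ["salary"],
        "sorting": {"field": "salary", "order": "desc"},
        "validate_card": true
    }
}

Here dropnull lists the columns whose null rows are removed, fillnull maps columns to their default values, normalise names the column to rescale, and validate_card expects a column literally named debitcard.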

Metadata file output: (screenshot in the original repository)
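
The file's shape follows directly from generateMetadata() below; the values in this sketch are hypothetical:

{
    "filename": "records.csv",
    "NumberofRows": 100,
    "Numberoffields": 4,
    "Format": {
        "type": "tabular",
        "Separators": ","
    },
    "fields": {
        "id": {"type": "numeric", "max": 100.0, "min": 1.0},
        "name": {"type": "string", "uniquevals": 97}
    }
}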

Preprocessed file output: (screenshot in the original repository)
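
The preprocessed file is a plain CSV: the header row followed by the cleaned, normalised, sorted rows. A hypothetical two-row example:

id,name,salary,debitcard
7,Alice,1.0,4123-4567-8912-3456
3,Bob,0.25,5234-5678-9123-4567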

Given below is the code, with a description of every line.

'''Generating meta-data'''

import sys
import csv
import json
import os
import re
import ast
from collections import OrderedDict

def readConfig(configfile):  # This function reads the config file
    with open(configfile) as con:  # the config file is opened as con
        fullfile = json.load(con)
    return fullfile  # the parsed config is returned

def readFileAsCSV(l):  # This function reads a CSV file; it takes the parsed config as input
    myfile = l.get("inputfile")  # name of the input file from config.json
    with open(myfile) as csvfile:  # file is opened
        fileinfo = list(csv.reader(csvfile))  # read the whole file into a list of rows
        headersinCSVfile = fileinfo[0]  # the first row holds the headers
        datainCSVfile = fileinfo[1:]  # every remaining row is data
        data = {'headers': headersinCSVfile, 'bodydata': datainCSVfile}  # store both in a dictionary
    return data  # returning the dictionary
    
def readFileAsJSON(l):  # This function reads a JSON file; it takes the parsed config as input
    myfile = l.get("inputfile")  # name of the input file from config.json
    datainJSONfile = []  # empty list to store all the rows
    headersinJSONfile = []  # empty list to store the field names
    with open(myfile, 'r') as jsonfile:  # open the JSON file
        alldata = json.load(jsonfile, object_pairs_hook=OrderedDict)  # load it, preserving key order
    if isinstance(alldata, dict):  # a single JSON object (isinstance also covers OrderedDict, which a type() comparison would miss)
        headersinJSONfile = list(alldata.keys())  # the keys become the field names
        datainJSONfile.append(list(alldata.values()))  # the values become the single row
    elif isinstance(alldata, list) and alldata:  # a list of JSON objects
        headersinJSONfile = list(alldata[0].keys())  # field names taken from the first record
        for i in alldata:  # every record, including the first, contributes a row
            datainJSONfile.append(list(i.values()))
    data = {'headers': headersinJSONfile, 'bodydata': datainJSONfile}  # store both in a dictionary

    return data  # returning the dictionary

def ColumnIsNum(col):  # This function checks whether a value is numeric
    col = str(col)
    colregex = re.compile(r'[-+]?(\d+(\.\d*)?|\.\d+)$')  # optional sign, digits, optional decimal part
    return colregex.match(col) is not None  # True only when the whole value is a number

def ColumnisNull(column):  # This function checks whether a value is null
    column = str(column).upper()  # normalise the case before comparing
    return column == 'NULL' or column == 'NAN'  # True if the value is Null or NaN
    
def infoEveryColumn(columndata, columnname):  # This function examines one column and summarises it
    everycolumnoutput = {}  # empty dictionary which stores the summary of the column
    countOfNum = 0
    countOfStr = 0
    stringcolumndata = {}
    for data in columndata:  # iterate over every value in the column
        if ColumnisNull(data):  # skip null values
            continue
        if ColumnIsNum(data):  # numeric value: track the running maximum and minimum
            value = float(data)
            if countOfNum == 0:  # the first numeric value initialises both extremes
                Maximumofcolumn = value
                Minimumofcolumn = value
            else:
                if value > Maximumofcolumn:
                    Maximumofcolumn = value
                if value < Minimumofcolumn:
                    Minimumofcolumn = value
            countOfNum += 1
        else:
            stringcolumndata[data] = True  # record the distinct string values
            countOfStr += 1
    if countOfNum > countOfStr:  # a majority vote decides the column type
        everycolumnoutput.update({'type': 'numeric', 'max': Maximumofcolumn, 'min': Minimumofcolumn})
    else:
        everycolumnoutput.update({'type': 'string', 'uniquevals': len(stringcolumndata)})
    return everycolumnoutput  # Returns a dictionary describing the column's type and statistics

def generateMetadata(l, data):
    myfile = l.get("inputfile")
    metadataoutput = OrderedDict()  # ordered dictionary to store the info from this function
    metadataoutput.update({"filename": str(myfile)})
    metadataoutput.update({'NumberofRows': len(data['bodydata'])})  # number of rows = length of the 'bodydata' list
    metadataoutput.update({'Numberoffields': len(data['headers'])})  # number of fields = length of the 'headers' list
    filename, ext = os.path.splitext(myfile)  # split the input file name to get its extension
    if ext == '.csv':  # tabular format: record the separator as well
        dialect = csv.Sniffer().sniff(str(data['bodydata'][1]))  # infer a delimiter from the string form of a sample row
        metadataoutput.update({"Format": {"type": 'tabular', "Separators": str(dialect.delimiter)}})
    elif ext == '.json':  # note the leading dot: os.path.splitext returns '.json'
        metadataoutput.update({"Format": {"type": 'json'}})  # JSON needs no separator
    count = 0
    fieldstype = {}  # dictionary holding per-column information, i.e. type, max, min, unique values
    for fieldname in data['headers']:  # iterate over every column
        collist = [row[count] for row in data['bodydata']]  # collect the whole column by position
        res = infoEveryColumn(collist, fieldname)  # res summarises the column: its type plus max/min or unique count
        fieldstype.update({fieldname: res})  # record the summary under the column name
        count += 1
    metadataoutput.update({'fields': fieldstype})  # attach the per-column summaries to the metadata dictionary
    return metadataoutput
'''Pre-processing data'''

def CreateJSONFile(l, mydict):  # This function writes a dictionary out as a JSON file; inputs are the config and the dictionary to write
    with open(l.get('metafile'), 'w') as f:  # open the metadata file named in the config
        json.dump(mydict, f, indent=4, sort_keys=False)
    

def dropnull(l, data):  # This function drops the rows where a particular column is null or NaN
    dropnullfields = l['preprocess']['missing']['dropnull']  # names of the columns to check
    evaldic = ast.literal_eval(json.dumps(dropnullfields))  # round-trip through JSON to turn unicode values into plain strings
    UpdatedDataofrows = [data['headers']]  # keep the header row
    index = data['headers'].index(evaldic[0])  # position of the (first) column to check
    for record in data['bodydata']:  # keep only the rows whose checked column is not null
        if not ColumnisNull(record[index]):
            UpdatedDataofrows.append(record)  # appending the surviving rows to the list
    return UpdatedDataofrows  # Returning the list with the null rows dropped
  
def fillnull(l, data):  # This function fills the null values in particular columns; inputs are the config and the row list from dropnull
    fillnullfields = l['preprocess']['missing']['fillnull']  # mapping of column name -> default value
    evaldic = ast.literal_eval(json.dumps(fillnullfields))  # round-trip through JSON to turn unicode values into plain strings
    defaults = [(data[0].index(col), val) for col, val in evaldic.items()]  # (column position, default value) pairs
    fullcolumndata = [data[0]]
    for record in data[1:]:  # iterate through the rows, replacing null or NaN entries with the configured default
        for idx, val in defaults:
            if ColumnisNull(record[idx]):
                record[idx] = val
        fullcolumndata.append(record)  # appending every updated row to the list
    return fullcolumndata  # Returning the list with the missing values filled in

def normalise(l, data, w):  # This function min-max normalises one column; w is the metadata dictionary
    getNormcolum = l['preprocess']['normalise'][0]  # name of the column to normalise
    i = data[0].index(getNormcolum)  # position of that column
    maximum = w['fields'][getNormcolum]['max']  # maximum of the column, from the metadata
    minimum = w['fields'][getNormcolum]['min']  # minimum of the column, from the metadata
    normaliseddatafull = [data[0]]
    for item in data[1:]:  # iterate through all the rows
        fvalue = float(item[i])  # convert the value to float so the arithmetic works
        item[i] = (fvalue - minimum) / (maximum - minimum)  # min-max normalisation formula
        normaliseddatafull.append(item)
    return normaliseddatafull

def sortData(l, data):  # This function sorts the rows by a particular column; inputs are the config and the normalised data
    sortorder = str(l['preprocess']['sorting']['order'])
    sortfield = str(l['preprocess']['sorting']['field'])
    columnindex = data[0].index(sortfield)
    reverse = (sortorder == 'desc')  # descending when the config says 'desc', ascending otherwise
    body = sorted(data[1:], key=lambda row: row[columnindex], reverse=reverse)  # sort only the data rows, never the header
    return [data[0]] + body  # header row first, then the sorted rows
 
def validation(l, data):  # This function validates the debit card column
    if l['preprocess']['validate_card']:  # only run when the config asks for it; do nothing otherwise
        string1 = r"(^[456])"  # the card number must start with 4, 5 or 6
        string2 = r"(\d{4})(-?)(\d{4})(\2\d{4}){2}"  # four groups of four digits with a consistent (optional) hyphen
        string3 = r"((\d)(?!\2{3})){16}"  # 16 digits with no digit repeated four times in a row
        index = data[0].index('debitcard')  # the column must be named 'debitcard'
        listdam = []
        for x in range(1, len(data)):
            value = str(data[x][index])  # str() covers both CSV and JSON inputs
            if re.match(string1, value) and re.match(string2, value):  # match against the first two patterns
                k = value.replace("-", "")  # strip the hyphens before the final check
                if re.match(string3, k):
                    listdam.append(data[x][0])  # remember the IDs of the rows that passed the debit card test
        return listdam  # the IDs of the rows with a valid card number
    return None  # validation disabled: signal the caller to keep every row

def removeInvalidDebitCardEntries(l, data, w):  # This function removes the invalid card entries; inputs are the sorted data and the list of valid IDs
    if w is None:  # validation was disabled, so keep everything
        return data
    cleanlist = [data[0]]
    for x in range(1, len(data)):
        if data[x][0] in w:
            cleanlist.append(data[x])  # keep the rows whose ID passed the card check
    return cleanlist
 
def preprocessData(config, data):  # This function runs the whole preprocessing pipeline
    UpdatedDataofrows = dropnull(config, data)  # drop the rows whose checked column is null or NaN
    fullcolumndata = fillnull(config, UpdatedDataofrows)  # replace the remaining nulls with the configured defaults
    metadataoutput = generateMetadata(config, data)  # regenerate the metadata so normalise() can look up max and min
    normaliseddatafull = normalise(config, fullcolumndata, metadataoutput)  # min-max normalise the configured column
    sorteddataoutput = sortData(config, normaliseddatafull)  # sort by the configured column and order
    listdam = validation(config, sorteddataoutput)  # IDs of the rows with a valid debit card number (or None)
    preprocesseddata = removeInvalidDebitCardEntries(config, sorteddataoutput, listdam)
    return preprocesseddata
'''The main function'''

def main(configFile):  # This is the main function; its input is the config.json file
    config = readConfig(configFile)  # reading the config file
    myfile = config.get("inputfile")  # getting the input file name from config.json
    filename, ext = os.path.splitext(myfile)  # getting the extension by splitting the file name
    if ext == ".json":
        data = readFileAsJSON(config)  # read the file as JSON
    elif ext == ".csv":
        data = readFileAsCSV(config)  # read the file as CSV
    else:
        raise ValueError("unsupported input file type: " + ext)
    metadataoutput = generateMetadata(config, data)  # generating metadata
    CreateJSONFile(config, metadataoutput)  # create the metadata file
    preprocesseddata = preprocessData(config, data)  # generate preprocessed data
    with open(config.get("outputfile"), mode="w") as out:  # write the data to the output file named in the config
        clean = csv.writer(out, delimiter=',')
        for line in preprocesseddata:
            clean.writerow(line)

if __name__ == '__main__':
    if len(sys.argv) > 1:
        configFile = sys.argv[1]
    else:
        configFile = "config.json"
    main(configFile)