# coding: utf-8
import re
import pandas as pd
pd.set_option('display.max_colwidth', -1)
#PREPROCESSING
#1)Copy paste only csv data from pastebin to notepad and save it as csv [data.csv].
#2)Open data.csv file in excel convert body field as string field and delete columns with
# empty fields then save file as csv format[cleandata.csv].
#3)Open that cleandata.csv file in notepad and save it as cleandata-utf8.csv as utf-8 format.
df = pd.read_csv("cleandata-utf8.csv")
df = df.drop(['date','smstype','smsflag'],axis=1)
df.head()
df['body'].head()
df['number'].unique()
#by examining all unique message title we found that every sms number column contain bank name
#like SBI in DZ-CBSSBI,TM-SBIINB etc.
#from here we extract all common bank code name in number column carefully
#Main bank code pattern : YESBNK,BOIIND,SBI,AXISBK,HDFCBK,ICICIB,PNB
#Note: look at that NaN number field carefully. It may contain some transaction details
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)
df[df['number'].isna()]
#dont worry this is just a remainder of some pay due
#Collect All possible different templates for best accura
#Other problem
#Repeated messages
#all message not show card or account information ,so this information is incomplete.
#Distorted messages like Chq 012740 for INR 1,746
df['Credit/Debit'] = None
df['Available_balance'] = None
df['Bank']= None
df.head()
total_msg = df.shape[0]
print("Total messages:",total_msg)
#regular expression for debit
debit1=r'.*Transfer of.*?\.(\d+).*Fees \(Incl of ST\): Rs.(\d+)'
debit2=r'.*Thank you for using your SBI Debit Card .*for a purchase worth Rs(\d+).(\d+) on POS.*'
debit3=r'.*Dear Customer, break up of consolidated charges for.*: Monthly Service Fee: Rs (\d+).*'
debit4=r'.*Rs (\d+) withdrawn from A/c .* on .* at .* .Avl bal Rs (\d+).(\d+).*'
debit5=r'.*Rs (\d+) withdrawn at .*from A/c .* on .*\.Avl bal Rs (\d+).(\d+).*'
debit6=r'.*Dear Customer, break up of consolidated charges for .*for a/c no. .*: ~Value Added SMS Alert Fee: Rs (\d+) ~Total: Rs (\d+) \(excluding GST\).*'
debit7=r'.*Your A/C .*has a debit by transfer of Rs (\d+).(\d+) on.*Avl Bal Rs (\d+).(\d+).*'
debit8=r'.*Your AC .*Debited INR (\d+).(\d+) on .*Avl Bal INR (.*)\.Plz download Buddy.*'
debit9=r'.*Thank you for using your SBI Debit Card .*for a purchase worth Rs(\d+) on POS.*'
debit10=r'.*OTP is .*for txn of INR (\d+).(\d+) at .*on card ending .*'
debit11=r'.*Rs.(\d+).(\d+) was spent on ur HDFCBank CREDIT Card ending.* on .*at .*Avl bal - Rs.(\d+).(\d+).*'
#regular expression for credit
credit1=r'.*Your AC .*Credited INR (\d+).(\d+) on.*Avl Bal INR (\d+).(\d+).*'
credit2=r'.*Dear HDFCBank cardmember, Payment of Rs (\d+) was credited to your card ending.*'
credit3=r'.*DEAR CARDMEMBER, PAYMENT OF Rs. (\d+).(\d+) RECEIVED TOWARDS YOUR CREDIT CARD ENDING .*THROUGH.*YOUR AVAILABLE LIMIT IS RS.*'
credit4=r'.*Your a/c no.*is credited by Rs.(\d+).(\d+) on .*by a/c linked to mobile.*'
credit5=r'.*Your A/C .*Credited INR (.*) on .*Deposit of Cash at .*ATM. A/c Balance INR (.*)'
#balance
bal1=r'.*Avail Bal in A/c .*: Rs. (\d+).(\d+) CR.*'
for i in range(total_msg):
text = str(df.iloc[i][1])
match1 = re.match(debit1,text)
match2 = re.match(debit2,text)
match3 = re.match(debit3,text)
match4 = re.match(debit4,text)
match5 = re.match(debit5,text)
match6 = re.match(debit6,text)
match7 = re.match(debit7,text)
match8 = re.match(debit8,text)
match9 = re.match(debit9,text)
match10 = re.match(debit10,text)
match11 = re.match(debit11,text)
match12 = re.match(credit1,text)
match13 = re.match(credit2,text)
match14 = re.match(credit3,text)
match15 = re.match(credit4,text)
match16 = re.match(credit5,text)
match17 = re.match(bal1,text)
if match1:
debit = float(match1.group(1))+float(match1.group(2))
df.loc[i,'Credit/Debit']=-debit
print(i)
elif match2:
debit=float(match2.group(1))+float('.'+match2.group(2))
df.loc[i,'Credit/Debit']=-debit
elif match3:
debit=float(match3.group(1))
df.loc[i,'Credit/Debit']=-debit
elif match4:
debit=float(match4.group(1))
bal = float(match4.group(2))+float('.'+match4.group(3))
df.loc[i,'Credit/Debit']=-debit
df.loc[i,'Available_balance']=bal
elif match5:
debit=float(match5.group(1))
bal = float(match5.group(2))+float('.'+match5.group(3))
df.loc[i,'Credit/Debit']=-debit
df.loc[i,'Available_balance']=bal
elif match6:
debit=float(match6.group(2))
df.loc[i,'Credit/Debit']=-debit
elif match7:
debit=float(match7.group(1))+float('.'+match7.group(2))
bal = float(match7.group(3))+float('.'+match7.group(4))
df.loc[i,'Credit/Debit']=-debit
df.loc[i,'Available_balance']=bal
elif match8:
debit=float(match8.group(1))+float('.'+match8.group(2))
bal = (float(re.sub(',','',match8.group(3))))
df.loc[i,'Credit/Debit']=-debit
df.loc[i,'Available_balance']=bal
elif match9:
debit=float(match9.group(1))
df.loc[i,'Credit/Debit']=-debit
elif match10:
debit=float(match10.group(1))+float('.'+match10.group(2))
df.loc[i,'Credit/Debit']=-debit
elif match11:
debit=float(match11.group(1))+float('.'+match11.group(2))
bal = float(match11.group(3))+float('.'+match11.group(4))
df.loc[i,'Credit/Debit']=-debit
df.loc[i,'Available_balance']=bal
elif match12:
credit=float(match12.group(1))+float('.'+match12.group(2))
bal = float(match12.group(3))+float('.'+match12.group(4))
df.loc[i,'Credit/Debit']=credit
df.loc[i,'Available_balance']=bal
elif match13:
credit = float(match13.group(1))
df.loc[i,'Credit/Debit']=credit
df.loc[i,'Available_balance']=bal
elif match14:
credit=float(match14.group(1))+float('.'+match14.group(2))
df.loc[i,'Credit/Debit']=credit
elif match15:
credit=float(match15.group(1))+float('.'+match15.group(2))
df.loc[i,'Credit/Debit']=credit
elif match16:
credit = (float(re.sub(',','',match16.group(1))))
bal = (float(re.sub(',','',match16.group(2))))
df.loc[i,'Credit/Debit']=credit
df.loc[i,'Available_balance']=bal
elif match17:
bal = float(match17.group(1))+float('.'+match17.group(2))
df.loc[i,'Available_balance']=bal
else:
df.loc[i,'Credit/Debit']=None
df.loc[i,'Available_balance']=None
message = str(df.iloc[i][2])
match_bank1=re.match(r'.*SBI.*',message)
match_bank2=re.match(r'.*ICICI.*',message)
match_bank3=re.match(r'.*HDFC.*',message)
match_bank4=re.match(r'.*PNB.*',message)
match_bank5=re.match(r'.*BOI.*',message)
match_bank6=re.match(r'.*YESBNK.*',message)
match_bank7=re.match(r'.*AXIS.*',message)
if(match_bank1):
df.loc[i,'Bank']='SBI'
elif(match_bank2):
df.loc[i,'Bank']='ICICI'
elif(match_bank3):
df.loc[i,'Bank']='HDFC'
elif(match_bank4):
df.loc[i,'Bank']='PNB'
elif(match_bank5):
df.loc[i,'Bank']='BOI'
elif(match_bank6):
df.loc[i,'Bank']='YESBNK'
elif(match_bank7):
df.loc[i,'Bank']='AXIS'
df
# Total amount of transaction (debit + credit) for different types of accounts
df2 = df.copy()
df2['Credit/Debit'].replace('None', 0, inplace=True)
df2['Credit/Debit'] = df2['Credit/Debit'].abs()
df2.groupby(['user_id','Bank'])['Credit/Debit'].sum()
# Account Balance for different account best recognise by message with date.
# Here Date of message are encoded so little problem in arrange message according to date.
# Some account not have their Aval Balance message .