trinker/textclean

replace_time and replace_money functions

trinker opened this issue · 5 comments

Convert times such as 12:47 into "twelve forty-seven" and money amounts such as $3.16 into "three dollars, sixteen cents."

Add a replace_date function too? The set of supported date formats would have to be limited.

#' Replace clock times with a spoken form or a fixed string
#'
#' Locates time stamps such as `12:47` or `8:35:02` in `x` and rewrites each
#' match, either through a function applied to the matched text or with a
#' fixed replacement string.
#'
#' @param x A character vector.
#' @param pattern Regular expression locating the times. Group 1 is the hour
#'   (0-23), group 2 the minutes and the optional group 3 the seconds. Seconds
#'   are only recognised directly after a `.` or `:` separator, so punctuation
#'   that merely follows a time ("at 12:47.") and stray trailing digits are
#'   left in the text instead of being consumed by the match.
#' @param replacement Either a function that receives one matched time and
#'   returns its replacement (applied through `fgsub()`), or a replacement
#'   string passed to `stringi::stri_replace_all_regex()`. The default splits
#'   the match on `:`/`.`, runs the pieces through `replace_number()` and
#'   words a third piece as "and ... seconds".
#' @param ... Currently ignored.
#' @return The result of `fgsub()` or `stringi::stri_replace_all_regex()`
#'   applied to `x`.
replace_time <- function(x,
    pattern = '(2[0-3]|[01]?[0-9]):([0-5][0-9])(?:[.:]([0-5]?[0-9]))?',
    replacement = function(y) {
        # Hour, minute and (when present) second, each converted separately.
        z <- replace_number(unlist(strsplit(y, '[:.]')))
        # Indexing past the end yields NA, i.e. the match had no seconds.
        if (!is.na(z[3])) {
            z[3] <- paste0('and ', z[3], ' seconds')
        }
        paste(z, collapse = ' ')
    }, ...) {

    # A function is applied match by match; anything else is treated as a
    # literal replacement string.
    if (is.function(replacement)) {
        fgsub(x, pattern, replacement)
    } else {
        stringi::stri_replace_all_regex(x, pattern, replacement)
    }

}

# Sample input: a missing value, text with one or two time stamps, and an
# out-of-range time that the default pattern does not match.
x <- c(
    NA,
    '12:47 to "twelve forty-seven" and also 8:35:02',
    'what about 14:24.5',
    'And then 99:99:99?'
)

# Default replacement function.
replace_time(x)


# Fixed placeholder string instead of a function.
replace_time(x, replacement = '<<TIME>>')

# Mask each component; seconds are masked only when the match has them.
# Base `paste()` joins the pieces: `collapse()` is not a base R function and
# nothing in this snippet defines or attaches it.
replace_time(x, replacement = function(y){
        z <- unlist(strsplit(y, '[:.]'))
        z[1] <- 'hh'
        z[2] <- 'mm'
        if (!is.na(z[3])) z[3] <- 'ss'
        paste(z, collapse = ':')
    }
)

# Always word the seconds, falling back to '0' when the match has none.
replace_time(x, replacement = function(y){
        z <- replace_number(unlist(strsplit(y, '[:.]')))
        secs <- if (is.na(z[3])) '0' else z[3]
        z[3] <- paste0('and ', secs, ' seconds')
        paste(z, collapse = ' ')
    }
)

# Mask every match as hh:mm:ss, adding the seconds slot when it is absent.
replace_time(x, replacement = function(y){
        z <- unlist(strsplit(y, '[:.]'))
        z[1] <- 'hh'
        z[2] <- 'mm'
        z[3] <- 'ss'
        paste(z, collapse = ':')
    }
)

#' Replace dollar amounts with a spoken form or a fixed string
#'
#' Locates amounts such as `$3.16` or `-$20,333.18` in `x` and rewrites each
#' match, either through a function applied to the matched text or with a
#' fixed replacement string.
#'
#' @param x A character vector.
#' @param pattern Regular expression locating the amounts: an optional leading
#'   `-`, a `$`, digits with optional commas, and an optional two-digit
#'   decimal part.
#' @param replacement Either a function that receives one matched amount and
#'   returns its replacement (applied through `fgsub()`), or a replacement
#'   string passed to `stringi::stri_replace_all_regex()`. The default words
#'   the amount via `replace_number()`, prefixes 'negative ' for a leading
#'   `-`, and names the unit in both branches: "... dollars<decimal>... cents"
#'   when cents are present, "... dollars" otherwise.
#' @param decimal Text placed between ' dollars' and the cents by the default
#'   replacement.
#' @param ... Currently ignored.
#' @return The result of `fgsub()` or `stringi::stri_replace_all_regex()`
#'   applied to `x`.
replace_money <- function(x, pattern = '(-?)([$])([0-9,]+)(\\.\\d{2})?',
    replacement = function(x, dec = decimal) {

        # `x` is a single matched amount here, so plain `if` is sufficient.
        sign <- if (grepl('^-', x)) 'negative ' else ''
        amount <- gsub('(-?)([$])', '', x)

        if (grepl('\\.', amount)) {
            # Turn the decimal point into the dollars/cents joiner before the
            # digits on either side are converted.
            number <- replace_number(gsub('\\.', paste0(' dollars', dec), amount))
            paste0(sign, number, ' cents')
        } else {
            # Whole-dollar amounts still need their unit.
            paste0(sign, replace_number(amount), ' dollars')
        }

    }, decimal = ' and ', ...) {

    # A function is applied match by match; anything else is treated as a
    # literal replacement string.
    if (is.function(replacement)) {
        fgsub(x, pattern, replacement)
    } else {
        stringi::stri_replace_all_regex(x, pattern, replacement)
    }

}

# Sample input: a missing value, a positive amount, a negative amount with a
# thousands separator, and a string that contains no amount.
x <- c(
    NA,
    '$3.16 into "three dollars, sixteen cents"',
    "-$20,333.18 too",
    'fff'
)

# Default replacement function with the default ' and ' joiner.
replace_money(x)
# Same, but joining dollars and cents with ', '.
replace_money(x, decimal = ', ')
# Fixed placeholder string instead of a function.
replace_money(x, replacement = '<<MONEY>>')
#' Replace numeric dates with a spoken form or a fixed string
#'
#' Locates dates written month-day-year (`11/16/1980`, `11-16-1980`) or
#' year-month-day (`2017-02-08`) in `x` and rewrites each match, either
#' through a function applied to the matched text or with a fixed replacement
#' string.
#'
#' @param x A character vector.
#' @param pattern Regular expression locating the dates; `/` and `-` are both
#'   accepted as separators and the year must have four digits.
#' @param replacement Either a function that receives one matched date and
#'   returns its replacement (applied through `fgsub()`), or a replacement
#'   string passed to `stringi::stri_replace_all_regex()`. The default builds
#'   "<month name> <ordinal day>, <year via replace_number()>". Matches whose
#'   month is outside 1-12 or whose day is outside 1-31 are returned
#'   unchanged, because the pattern is loose enough to admit them.
#' @param ... Currently ignored.
#' @return The result of `fgsub()` or `stringi::stri_replace_all_regex()`
#'   applied to `x`.
replace_date <- function(x,
    pattern = '([01]?[0-9])[/-]([0-2]?[0-9]|3[01])[/-]\\d{4}|\\d{4}[/-]([01]?[0-9])[/-]([0-2]?[0-9]|3[01])',
    replacement = function(x) {

        # Move a trailing year to the front so both layouts split into
        # year, month, day.
        reordered <- gsub('(^.+)([/-])(\\d{4})', '\\3\\2\\1', x, perl = TRUE)
        parts <- strsplit(reordered, '[/-]')[[1]]

        month <- as.integer(parts[2])
        day <- as.integer(parts[3])

        # The pattern lets through month 0, months 13-19 and day 0; indexing
        # `month.name` with those yields NA or nothing, so keep the text as is.
        if (!(month %in% 1:12) || !(day %in% 1:31)) {
            return(x)
        }

        y <- replace_number(parts[1])
        m <- month.name[month]
        d <- english::ordinal(day)
        paste0(m, ' ', d, ', ', y)

    }, ...) {

    # A function is applied match by match; anything else is treated as a
    # literal replacement string.
    if (is.function(replacement)) {
        fgsub(x, pattern, replacement)
    } else {
        stringi::stri_replace_all_regex(x, pattern, replacement)
    }

}

# Sample input: a missing value, month-day-year dates with both separators,
# and a year-first date next to a date without zero padding.
x <- c(
    NA,
    "11-16-1980 and 11/16/1980",
    "and 2017-02-08 but then there's 2/8/2017 too"
)

# Default replacement function.
replace_date(x)
# Fixed placeholder string instead of a function.
replace_date(x, replacement = "<<DATE>>")

Still need to include an example in the README.