use mget, list2env and map to build the workflow.
#################################
# !diagnostics off
library(tidyverse)
library(readr)
library(stringr)
# Set up dir ------------------------------------------------------------------
DIR_PRJBASE <- 'C:/Users/UserName/ProjectBase'
DIR_SCRIPT <- file.path(DIR_PRJBASE)
setwd(DIR_SCRIPT)
DIR_INPUT <- file.path(DIR_PRJBASE, 'input')
DIR_MIDPUT <- file.path(DIR_PRJBASE, 'midput')
DIR_OUTPUT <- file.path(DIR_PRJBASE, 'output')
# Data steps --------------------------------------------------------------
FILE_SRC_DATA_RAW_1 <- file.path(DIR_INPUT, 'datafile1.csv')
FILE_SRC_DATA_RAW_2 <- file.path(DIR_INPUT, 'datafile2.csv')
FILE_SRC_DATA_RAW_3 <- file.path(DIR_INPUT, 'datafile3.csv')
FILE_OUT_RESULT <- file.path(DIR_OUTPUT, 'Result.xlsx')
FILE_OUT_RESULT_BKP <- file.path(DIR_OUTPUT, 'Result.xlsx.baK')
# Helpers -----------------------------------------------------------------
step1 <- function(aDf) {
aDf %>%
mutate(Email = str_to_lower(str_trim(Email))) ->
ret
return(ret)
}
step2 <- function(aDf){
aDf %>%
mutate_if(is.character, str_trim, side = 'both') ->
ret
return(ret)
}
emailSubtract <- function(aSrcDf, aFromDf) {
aSrcDf %>%
filter(!(Email %in% aFromDf$Email)) ->
ret
return(ret)
}
# Load data: Unsubscription ---------------------------------------------------------------
dfRaw_UNSUB <- read.csv(col.names = c('Email'),
header = FALSE,
FILE_SRC_DATA_RAW_UNSUB)
dfRaw_ENT <- read_csv(FILE_SRC_DATA_RAW_ENT)
dfRaw_MISC <- read_csv(FILE_SRC_DATA_RAW_MISC)
dfRaw_SMB <- read_csv(FILE_SRC_DATA_RAW_SMB)
# Clean up ---------------------------------------------------------------
dfNameCore <- c('ENT', 'MISC', 'SMB')
dfRawNames <- paste0('dfRaw_', dfNameCore)
newColNames <- c('FirstName', 'LastName',
'Title',
'Email', 'DirectPhone', 'CompanyPhone', 'CompanyName')
list2env(map(mget(dfRawNames), setNames, newColNames), .GlobalEnv)
dfNameCore <- c('UNSUB', dfNameCore)
dfRawNames <- paste0('dfRaw_', dfNameCore)
dfCleanNames <- paste0('dfClean_', dfNameCore)
mget(dfRawNames) %>%
map(distinct) %>%
map(trimAllColumns) %>%
map(cleanEmail) %>%
set_names(paste0('dfClean_', dfNameCore)) %>%
list2env(.GlobalEnv)
stats <- data.frame(DataSet = dfNameCore,
RowsInRaw = map_int(mget(dfRawNames), nrow),
RowsInClean = map_int(mget(dfCleanNames), nrow),
UniqueEmailsInRaw = map_int(mget(dfRawNames), ~ length(unique(.x$Email))),
UniqueEmailsInClean = map_int(mget(dfCleanNames), ~ length(unique(.x$Email))),
row.names = NULL)
# Substract ---------------------------------------------------------------
df_ENT_UNSUB <- emailSubtract(dfClean_ENT, dfClean_UNSUB)
df_SMB_UNSUB <- emailSubtract(dfClean_SMB, dfClean_UNSUB)
df_MISC_UNSUB <- emailSubtract(dfClean_MISC, dfClean_UNSUB)
dim(dfClean_ENT)[1] - dim(df_ENT_UNSUB)[1]
dim(dfClean_SMB)[1] - dim(df_SMB_UNSUB)[1]
dim(dfClean_MISC)[1] - dim(df_MISC_UNSUB)[1]
# Further Substract ---------------------------------------------------------------
df_ENT_UNSUB_MISC <- emailSubtract(df_ENT_UNSUB, df_MISC_UNSUB)
df_SMB_UNSUB_MISC <- emailSubtract(df_SMB_UNSUB, df_MISC_UNSUB)
dim(df_ENT_UNSUB)[1] - dim(df_ENT_UNSUB_MISC)[1]
dim(df_SMB_UNSUB)[1] - dim(df_SMB_UNSUB_MISC)[1]
# Save result -------------------------------------------------------------
dfToSave <- mget(c(dfRawNames, dfCleanNames,
c('df_ENT_UNSUB', 'df_SMB_UNSUB', 'df_MISC_UNSUB',
'df_ENT_UNSUB_MISC', 'df_SMB_UNSUB_MISC')))
dfToSaveSheetName <- c(paste0('Raw ',dfNameCore), paste0('Clean ',dfNameCore),
c('ENT remove UNSUB', 'SMB remove UNSUB', 'MISC remove UNSUB',
'ENT remove UNSUB and MISC', 'SMB remove UNSUB and MISC'))
# deal with JAVA issues
# # Error in .jcall("RJavaTools", "Ljava/lang/Object;", "invokeMethod", cl, :
# https://stackoverflow.com/questions/21937640/handling-java-lang-outofmemoryerror-when-writing-to-excel-from-r
# https://stackoverflow.com/questions/7019912/using-the-rjava-package-on-win7-64-bit-with-r
options(java.parameters = "-Xmx4g")
options(java.home="C:/Program Files/Java/jdk1.8.0_144/jre/bin/server")
library(rJava)
library(xlsx)
jgc <- function()
{
gc()
.jcall("java/lang/System", method = "gc")
}
if(!file.exists(FILE_OUT_RESULT)){
file.rename(FILE_OUT_RESULT, FILE_OUT_RESULT_BKP)
}
# map2(dfToSave, dfToSaveSheetName,
# ~write.xlsx(.x, file=FILE_OUT_RESULT, sheetName=.y,
# col.names=TRUE, row.names=TRUE, append=TRUE, showNA=TRUE))
wb <- createWorkbook()
for(i in seq_along(dfToSave)){
jgc()
message("Creating sheet", i)
sheet <- createSheet(wb, sheetName = dfToSaveSheetName[[i]])
message("Adding data frame", i)
addDataFrame(dfToSave[[i]], sheet)
}
saveWorkbook(wb, FILE_OUT_RESULT)