Friday, November 10, 2017

Two handy helper for Windows user

# get Window path
ss_getwinpath <- function(aPath = "clipboard") {
y <- if (aPath == "clipboard") {
readClipboard()
} else {
cat("Please enter the path:\n\n")
readline()
}

x <- chartr("\\", "/", y)
writeClipboard(x)

return(x)
}

# send object to clipboard for excel
ss_send2excel <- function(aObj) {
write.table(aObj, file="clipboard-16384", sep="\t", row.names=FALSE, col.names=TRUE)
}


Put here: C:\Program Files\R\R-3.4.2\library\base\R\Rprofile
detail for configuration: https://getpocket.com/a/read/717423088

Thursday, November 9, 2017

Write to multiple sheets in R (xlsx, XLConnect and openxlsx)

xlsx works in the end after the JAVA memory issue

 XLConnect and openxlsx doese not work

# deal with JAVA issues
# # Error in .jcall("RJavaTools", "Ljava/lang/Object;", "invokeMethod", cl,  :
# https://stackoverflow.com/questions/21937640/handling-java-lang-outofmemoryerror-when-writing-to-excel-from-r
# https://stackoverflow.com/questions/7019912/using-the-rjava-package-on-win7-64-bit-with-r
options(java.parameters = "-Xmx4g")
options(java.home="C:/Program Files/Java/jdk1.8.0_144/jre/bin/server")
library(rJava)
library(xlsx)

jgc <- function()
{
  gc()
  .jcall("java/lang/System", method = "gc")


if(!file.exists(FILE_OUT_RESULT)){
  file.rename(FILE_OUT_RESULT, FILE_OUT_RESULT_BKP)
}

# map2(dfToSave, dfToSaveSheetName,
#      ~write.xlsx(.x, file=FILE_OUT_RESULT, sheetName=.y,
#                  col.names=TRUE, row.names=TRUE, append=TRUE, showNA=TRUE))

wb <- createWorkbook() 
for(i in seq_along(dfToSave)){
  jgc()
  message("Creating sheet", i)
  sheet <- createSheet(wb, sheetName = dfToSaveSheetName[[i]])
  message("Adding data frame", i)
  addDataFrame(dfToSave[[i]], sheet)
}
saveWorkbook(wb, FILE_OUT_RESULT)



# # XLConnect ---------------------------------------------------------------
# # too loop to response
# options(java.parameters = "-Xmx4g" )
# library(XLConnect)
# outputWB <- loadWorkbook(FILE_OUT_RESULT,  create=TRUE)
# for(i in seq_along(dfToSave)){
#   createSheet(outputWB, name=dfToSaveSheetName[[i]])
# }
#
# for(i in seq_along(dfToSave)){
#   writeWorksheet(outputWB,
#                  dfToSave[[i]],
#                  sheet = dfToSaveSheetName[[i]])
# }
# saveWorkbook(outputWB)
#
# # openxlsx ----------------------------------------------------------------
# #input string 9903 is invalid UTF-8
# library(openxlsx)
# wb <- createWorkbook()
# map(dfToSaveSheetName, ~ addWorksheet(wb, .x))
# map2(dfToSave, dfToSaveSheetName, ~ writeData(wb = wb, sheet = .y, x=.x))
# saveWorkbook(wb, FILE_OUT_RESULT, overwrite = TRUE)

A framework for processing multiple text files


use mget, list2env and map to build the workflow.

#################################
# !diagnostics off
library(tidyverse)
library(readr)
library(stringr)


# Set up dir ------------------------------------------------------------------
DIR_PRJBASE <- 'C:/Users/UserName/ProjectBase'

DIR_SCRIPT <- file.path(DIR_PRJBASE)
setwd(DIR_SCRIPT)

DIR_INPUT <- file.path(DIR_PRJBASE, 'input')
DIR_MIDPUT <- file.path(DIR_PRJBASE, 'midput')
DIR_OUTPUT <- file.path(DIR_PRJBASE, 'output')


# Data steps --------------------------------------------------------------
FILE_SRC_DATA_RAW_1   <- file.path(DIR_INPUT, 'datafile1.csv')
FILE_SRC_DATA_RAW_2 <- file.path(DIR_INPUT, 'datafile2.csv')
FILE_SRC_DATA_RAW_3  <- file.path(DIR_INPUT, 'datafile3.csv')


FILE_OUT_RESULT   <- file.path(DIR_OUTPUT, 'Result.xlsx')
FILE_OUT_RESULT_BKP <- file.path(DIR_OUTPUT, 'Result.xlsx.baK')


# Helpers -----------------------------------------------------------------
step1 <- function(aDf) {
  aDf %>%
    mutate(Email = str_to_lower(str_trim(Email))) ->
    ret
  return(ret)
}

step2 <- function(aDf){
  aDf %>%
    mutate_if(is.character, str_trim, side = 'both') ->
    ret
 
  return(ret)
}

emailSubtract <- function(aSrcDf, aFromDf) {
  aSrcDf %>%
    filter(!(Email %in% aFromDf$Email)) ->
    ret
 
  return(ret)
}


# Load data: Unsubscription ---------------------------------------------------------------
dfRaw_UNSUB <- read.csv(col.names = c('Email'),
                        header = FALSE,
                        FILE_SRC_DATA_RAW_UNSUB)

dfRaw_ENT  <- read_csv(FILE_SRC_DATA_RAW_ENT)
dfRaw_MISC <- read_csv(FILE_SRC_DATA_RAW_MISC)
dfRaw_SMB  <- read_csv(FILE_SRC_DATA_RAW_SMB)


# Clean up ---------------------------------------------------------------
dfNameCore <- c('ENT', 'MISC', 'SMB')
dfRawNames   <- paste0('dfRaw_', dfNameCore)

newColNames <- c('FirstName', 'LastName',
                 'Title',
                 'Email', 'DirectPhone', 'CompanyPhone', 'CompanyName')
list2env(map(mget(dfRawNames), setNames, newColNames), .GlobalEnv)

dfNameCore <- c('UNSUB', dfNameCore)
dfRawNames   <- paste0('dfRaw_', dfNameCore)
dfCleanNames <- paste0('dfClean_', dfNameCore)

mget(dfRawNames) %>%
  map(distinct) %>%
  map(trimAllColumns) %>%
  map(cleanEmail) %>%
  set_names(paste0('dfClean_', dfNameCore)) %>%
  list2env(.GlobalEnv)


stats <- data.frame(DataSet = dfNameCore,
                    RowsInRaw      = map_int(mget(dfRawNames),   nrow),
                    RowsInClean    = map_int(mget(dfCleanNames), nrow),
                    UniqueEmailsInRaw   = map_int(mget(dfRawNames),   ~ length(unique(.x$Email))),
                    UniqueEmailsInClean = map_int(mget(dfCleanNames), ~ length(unique(.x$Email))),
                    row.names = NULL)


# Substract ---------------------------------------------------------------
df_ENT_UNSUB  <- emailSubtract(dfClean_ENT,  dfClean_UNSUB)
df_SMB_UNSUB  <- emailSubtract(dfClean_SMB,  dfClean_UNSUB)
df_MISC_UNSUB <- emailSubtract(dfClean_MISC, dfClean_UNSUB)

dim(dfClean_ENT)[1]  - dim(df_ENT_UNSUB)[1]
dim(dfClean_SMB)[1]  - dim(df_SMB_UNSUB)[1]
dim(dfClean_MISC)[1] - dim(df_MISC_UNSUB)[1]


# Further Substract ---------------------------------------------------------------
df_ENT_UNSUB_MISC <- emailSubtract(df_ENT_UNSUB, df_MISC_UNSUB)
df_SMB_UNSUB_MISC <- emailSubtract(df_SMB_UNSUB, df_MISC_UNSUB)

dim(df_ENT_UNSUB)[1] - dim(df_ENT_UNSUB_MISC)[1]
dim(df_SMB_UNSUB)[1] - dim(df_SMB_UNSUB_MISC)[1]


# Save result -------------------------------------------------------------
dfToSave <- mget(c(dfRawNames, dfCleanNames,
                   c('df_ENT_UNSUB', 'df_SMB_UNSUB', 'df_MISC_UNSUB',
                     'df_ENT_UNSUB_MISC', 'df_SMB_UNSUB_MISC')))
dfToSaveSheetName <- c(paste0('Raw ',dfNameCore), paste0('Clean ',dfNameCore),
                       c('ENT remove UNSUB', 'SMB remove UNSUB', 'MISC remove UNSUB',
                         'ENT remove UNSUB and MISC', 'SMB remove UNSUB and MISC'))

# deal with JAVA issues
# # Error in .jcall("RJavaTools", "Ljava/lang/Object;", "invokeMethod", cl,  :
# https://stackoverflow.com/questions/21937640/handling-java-lang-outofmemoryerror-when-writing-to-excel-from-r
# https://stackoverflow.com/questions/7019912/using-the-rjava-package-on-win7-64-bit-with-r
options(java.parameters = "-Xmx4g")
options(java.home="C:/Program Files/Java/jdk1.8.0_144/jre/bin/server")
library(rJava)
library(xlsx)

jgc <- function()
{
  gc()
  .jcall("java/lang/System", method = "gc")


if(!file.exists(FILE_OUT_RESULT)){
  file.rename(FILE_OUT_RESULT, FILE_OUT_RESULT_BKP)
}

# map2(dfToSave, dfToSaveSheetName,
#      ~write.xlsx(.x, file=FILE_OUT_RESULT, sheetName=.y,
#                  col.names=TRUE, row.names=TRUE, append=TRUE, showNA=TRUE))

wb <- createWorkbook() 
for(i in seq_along(dfToSave)){
  jgc()
  message("Creating sheet", i)
  sheet <- createSheet(wb, sheetName = dfToSaveSheetName[[i]])
  message("Adding data frame", i)
  addDataFrame(dfToSave[[i]], sheet)
}
saveWorkbook(wb, FILE_OUT_RESULT)

Tuesday, September 5, 2017

Use Gmail filter to distribute Tableau dashboard subscription

To extend the users supported by Tableau servers (e.g. default 10 seat license)

Time sequence diagram

R code refactoring: load compressed matrix and retrieve corelation

Load sparse correlation matrix from file

# Original code
relevant_matrix <- (function() {
  m <- matrix(0, 100, 100)
  apply(read.csv("amatrixfile.csv"), 1,
        function(x) { m[x[1], x[2]] <<- x[3] })
  m
})()

# refactored
# Improvement:
# 1. Remove hard coded file name
# 2. Adapt to matrix dimension
# 3. Set diagonals as 1
# 4. Make code reuseable
loadRelavanceMatrix <- function(aFileName){
  rel <- read_csv(aFileName)
  entryDim <- max(c(rel$rawSrc, rel$rawDest))

  ret <- matrix(0, entryDim, entryDim)
  rel %>%
    by_row(function(aRow) {ret[aRow[[1]],aRow[[2]] ]<<- aRow[[3]]})
 # pmap version
  #pmap(list(aRow = relevence_discipline[[1]],            
  #        aCol = relevence_discipline[[2]],
  #       aRel = relevence_discipline[[3]]),
  #   function(aRow, aCol, aRel){
  #     relevant_disciplines_matrix2[aRow, aCol] <<- aRel
   #  })

  diag(ret) <- 1

  return(ret)
}

Retrieve correlation

# Original code
relevant_disciplines <- function(disciplines) {
  if(length(disciplines)==0) { return(NULL) }
  aggregate(
    relevance ~ .,
    rbind(data.frame(id=disciplines, relevance=1.0),
      Reduce(rbind, lapply(disciplines, function(discipline_id) {
        nonzero <- relevant_disciplines_matrix[discipline_id,] > 0
        data.frame(id=which(nonzero),
                   relevance=relevant_disciplines_matrix[discipline_id, nonzero])
      }))),
    max)
}

# Refactored
relevant_disciplines

Friday, June 30, 2017

Extract first component of a string list: dplyr vs. purrr

Goal: Compare two styles of functional programming on parsing string.
Input: The cabin column in Titanic data (Kaggle)
Output: the number extracted from the first components of the space delimited string


Code:
#################################################
library(microbenchmark)
library(tidyverse)
library(stringr)

pc <- microbenchmark(
    purrrV  = {  
        trainDf0 <- trainDfRaw
        trainDf0$Cabin %>%
            as.character() %>%
            map(str_split, " ") %>%
            flatten() %>%
            map_chr(1) %>%
            map_chr(str_replace_all, '[[:alpha:]]','') %>%
            map(as.integer) %>%
            map(coalesce, as.integer(0)) -> trainDf0$Cabin_Number
    },
    dplyV = {
        trainDf1 <- trainDfRaw
        trainDf1 %>% mutate(Cabin_Number = as.character(Cabin)) %>%
            separate(Cabin_Number, into = c("Cabin_Number"), sep = " ",  extra = "drop", remove = TRUE) %>%
            mutate(Cabin_Number = coalesce(as.integer(str_replace(Cabin_Number, "[[:alpha:]]", "")), as.integer(0))) ->
            trainDf1
    }
)

Result
Unit: milliseconds
   expr        min         lq       mean     median        uq        max neval
 purrrV 1363.32589 1405.10168 1607.33881 1429.10920 1561.6362 3453.50236   100
  dplyV   15.69765   16.69651   19.83311   17.46755   19.8518   79.28219   100


dply is much faster than purrr

Sunday, June 11, 2017

Customize Legend in Tableau

Tableau default legend sometimes is too big to fit in the real estate. Thus, we need to use other technique to display a legend. Other use cases include dynamic legend (metric switch or hide when no chart needed), or with extra info (e.g. use bar chart).

Here are some examples:

Name Solution Picture Solution
Bar chart Use bar chart to provide more info

Rotated Rotation to save space


Embedded Put label inside colored shape


Download Tableau workbook (v10.2)

Sunday, January 29, 2017

Add measure to column header

We can add numeric info to the column header to introduce the second measure, or other dimension info as showed in picture below (download workbook).

Steps:
1. Create calculated filed in which we can use STR function to convert number to string, use ATTR to include original dimension break, and use CHAR(10) to insert line break,  e.g. ATTR([Segment])+ CHAR(10) + str([Segment Size]).
 2. Use two dimension as column header. The first is the customized string in step 1, and the second is the original dimension.
3. Uncheck the Show Header on the original dimension.




3 methods to draw wave/break bar chart

When we design bar chart with very different scales along columns, wave/break line can help to call out the scale difference, like this Excel bar chart (original post).


Here I have three methods to draw wave/break bar chart (download workbook):