Sean's Data Analysis Note: 2017

Friday, November 10, 2017

Two handy helper for Windows user

# get Window path

ss_getwinpath <- function(aPath = "clipboard") {
y <- if (aPath == "clipboard") {
readClipboard()
} else {
cat("Please enter the path:\n\n")
readline()
}

x <- chartr("\\", "/", y)
writeClipboard(x)

return(x)
}

# send object to clipboard for excel
ss_send2excel <- function(aObj) {
write.table(aObj, file="clipboard-16384", sep="\t", row.names=FALSE, col.names=TRUE)
}

Put here: C:\Program Files\R\R-3.4.2\library\base\R\Rprofile

detail for configuration: https://getpocket.com/a/read/717423088

Thursday, November 9, 2017

Write to multiple sheets in R (xlsx, XLConnect and openxlsx)

xlsx works in the end after the JAVA memory issue

XLConnect and openxlsx doese not work

# deal with JAVA issues
# # Error in .jcall("RJavaTools", "Ljava/lang/Object;", "invokeMethod", cl, :
# https://stackoverflow.com/questions/21937640/handling-java-lang-outofmemoryerror-when-writing-to-excel-from-r
# https://stackoverflow.com/questions/7019912/using-the-rjava-package-on-win7-64-bit-with-r
options(java.parameters = "-Xmx4g")
options(java.home="C:/Program Files/Java/jdk1.8.0_144/jre/bin/server")
library(rJava)
library(xlsx)

jgc <- function()
{
gc()
.jcall("java/lang/System", method = "gc")
}

if(!file.exists(FILE_OUT_RESULT)){
file.rename(FILE_OUT_RESULT, FILE_OUT_RESULT_BKP)
}

# map2(dfToSave, dfToSaveSheetName,
#      ~write.xlsx(.x, file=FILE_OUT_RESULT, sheetName=.y,
#                  col.names=TRUE, row.names=TRUE, append=TRUE, showNA=TRUE))

wb <- createWorkbook()
for(i in seq_along(dfToSave)){
jgc()
message("Creating sheet", i)
sheet <- createSheet(wb, sheetName = dfToSaveSheetName[[i]])
message("Adding data frame", i)
addDataFrame(dfToSave[[i]], sheet)
}
saveWorkbook(wb, FILE_OUT_RESULT)

# # XLConnect ---------------------------------------------------------------
# # too loop to response
# options(java.parameters = "-Xmx4g" )
# library(XLConnect)
# outputWB <- loadWorkbook(FILE_OUT_RESULT, create=TRUE)
# for(i in seq_along(dfToSave)){
#   createSheet(outputWB, name=dfToSaveSheetName[[i]])
# }
#
# for(i in seq_along(dfToSave)){
#   writeWorksheet(outputWB,
#                  dfToSave[[i]],
#                  sheet = dfToSaveSheetName[[i]])
# }
# saveWorkbook(outputWB)
#
# # openxlsx ----------------------------------------------------------------
# #input string 9903 is invalid UTF-8
# library(openxlsx)
# wb <- createWorkbook()
# map(dfToSaveSheetName, ~ addWorksheet(wb, .x))
# map2(dfToSave, dfToSaveSheetName, ~ writeData(wb = wb, sheet = .y, x=.x))
# saveWorkbook(wb, FILE_OUT_RESULT, overwrite = TRUE)

A framework for processing multiple text files

use mget, list2env and map to build the workflow.

#################################
# !diagnostics off
library(tidyverse)
library(readr)
library(stringr)

# Set up dir ------------------------------------------------------------------
DIR_PRJBASE <- 'C:/Users/UserName/ProjectBase'

DIR_SCRIPT <- file.path(DIR_PRJBASE)
setwd(DIR_SCRIPT)

DIR_INPUT <- file.path(DIR_PRJBASE, 'input')
DIR_MIDPUT <- file.path(DIR_PRJBASE, 'midput')
DIR_OUTPUT <- file.path(DIR_PRJBASE, 'output')

# Data steps --------------------------------------------------------------
FILE_SRC_DATA_RAW_1   <- file.path(DIR_INPUT, 'datafile1.csv')
FILE_SRC_DATA_RAW_2 <- file.path(DIR_INPUT, 'datafile2.csv')
FILE_SRC_DATA_RAW_3 <- file.path(DIR_INPUT, 'datafile3.csv')

FILE_OUT_RESULT   <- file.path(DIR_OUTPUT, 'Result.xlsx')
FILE_OUT_RESULT_BKP <- file.path(DIR_OUTPUT, 'Result.xlsx.baK')

# Helpers -----------------------------------------------------------------
step1 <- function(aDf) {
aDf %>%
    mutate(Email = str_to_lower(str_trim(Email))) ->
    ret
return(ret)
}

step2 <- function(aDf){
aDf %>%
    mutate_if(is.character, str_trim, side = 'both') ->
    ret

return(ret)
}

emailSubtract <- function(aSrcDf, aFromDf) {
aSrcDf %>%
    filter(!(Email %in% aFromDf$Email)) ->
    ret

return(ret)
}

# Load data: Unsubscription ---------------------------------------------------------------
dfRaw_UNSUB <- read.csv(col.names = c('Email'),
                        header = FALSE,
                        FILE_SRC_DATA_RAW_UNSUB)

dfRaw_ENT <- read_csv(FILE_SRC_DATA_RAW_ENT)
dfRaw_MISC <- read_csv(FILE_SRC_DATA_RAW_MISC)
dfRaw_SMB <- read_csv(FILE_SRC_DATA_RAW_SMB)

# Clean up ---------------------------------------------------------------
dfNameCore <- c('ENT', 'MISC', 'SMB')
dfRawNames   <- paste0('dfRaw_', dfNameCore)

newColNames <- c('FirstName', 'LastName',
                 'Title',
                 'Email', 'DirectPhone', 'CompanyPhone', 'CompanyName')
list2env(map(mget(dfRawNames), setNames, newColNames), .GlobalEnv)

dfNameCore <- c('UNSUB', dfNameCore)
dfRawNames   <- paste0('dfRaw_', dfNameCore)
dfCleanNames <- paste0('dfClean_', dfNameCore)

mget(dfRawNames) %>%
map(distinct) %>%
map(trimAllColumns) %>%
map(cleanEmail) %>%
set_names(paste0('dfClean_', dfNameCore)) %>%
list2env(.GlobalEnv)

stats <- data.frame(DataSet = dfNameCore,
                    RowsInRaw      = map_int(mget(dfRawNames),   nrow),
                    RowsInClean    = map_int(mget(dfCleanNames), nrow),
                    UniqueEmailsInRaw   = map_int(mget(dfRawNames),   ~ length(unique(.x$Email))),
                    UniqueEmailsInClean = map_int(mget(dfCleanNames), ~ length(unique(.x$Email))),
                    row.names = NULL)

# Substract ---------------------------------------------------------------
df_ENT_UNSUB <- emailSubtract(dfClean_ENT, dfClean_UNSUB)
df_SMB_UNSUB <- emailSubtract(dfClean_SMB, dfClean_UNSUB)
df_MISC_UNSUB <- emailSubtract(dfClean_MISC, dfClean_UNSUB)

dim(dfClean_ENT)[1] - dim(df_ENT_UNSUB)[1]
dim(dfClean_SMB)[1] - dim(df_SMB_UNSUB)[1]
dim(dfClean_MISC)[1] - dim(df_MISC_UNSUB)[1]

# Further Substract ---------------------------------------------------------------
df_ENT_UNSUB_MISC <- emailSubtract(df_ENT_UNSUB, df_MISC_UNSUB)
df_SMB_UNSUB_MISC <- emailSubtract(df_SMB_UNSUB, df_MISC_UNSUB)

dim(df_ENT_UNSUB)[1] - dim(df_ENT_UNSUB_MISC)[1]
dim(df_SMB_UNSUB)[1] - dim(df_SMB_UNSUB_MISC)[1]

# Save result -------------------------------------------------------------
dfToSave <- mget(c(dfRawNames, dfCleanNames,
                   c('df_ENT_UNSUB', 'df_SMB_UNSUB', 'df_MISC_UNSUB',
                     'df_ENT_UNSUB_MISC', 'df_SMB_UNSUB_MISC')))
dfToSaveSheetName <- c(paste0('Raw ',dfNameCore), paste0('Clean ',dfNameCore),
                       c('ENT remove UNSUB', 'SMB remove UNSUB', 'MISC remove UNSUB',
                         'ENT remove UNSUB and MISC', 'SMB remove UNSUB and MISC'))

# deal with JAVA issues
# # Error in .jcall("RJavaTools", "Ljava/lang/Object;", "invokeMethod", cl, :
# https://stackoverflow.com/questions/21937640/handling-java-lang-outofmemoryerror-when-writing-to-excel-from-r
# https://stackoverflow.com/questions/7019912/using-the-rjava-package-on-win7-64-bit-with-r
options(java.parameters = "-Xmx4g")
options(java.home="C:/Program Files/Java/jdk1.8.0_144/jre/bin/server")
library(rJava)
library(xlsx)

jgc <- function()
{
gc()
.jcall("java/lang/System", method = "gc")
}

if(!file.exists(FILE_OUT_RESULT)){
file.rename(FILE_OUT_RESULT, FILE_OUT_RESULT_BKP)
}

# map2(dfToSave, dfToSaveSheetName,
#      ~write.xlsx(.x, file=FILE_OUT_RESULT, sheetName=.y,
#                  col.names=TRUE, row.names=TRUE, append=TRUE, showNA=TRUE))

wb <- createWorkbook()
for(i in seq_along(dfToSave)){
jgc()
message("Creating sheet", i)
sheet <- createSheet(wb, sheetName = dfToSaveSheetName[[i]])
message("Adding data frame", i)
addDataFrame(dfToSave[[i]], sheet)
}
saveWorkbook(wb, FILE_OUT_RESULT)

Tuesday, September 5, 2017

Use Gmail filter to distribute Tableau dashboard subscription

To extend the users supported by Tableau servers (e.g. default 10 seat license)

Time sequence diagram

R code refactoring: load compressed matrix and retrieve corelation

Load sparse correlation matrix from file

# Original code
relevant_matrix <- (function() {
m <- matrix(0, 100, 100)
apply(read.csv("amatrixfile.csv"), 1,
function(x) { m[x[1], x[2]] <<- x[3] })
m
})()

# refactored
# Improvement:
# 1. Remove hard coded file name
# 2. Adapt to matrix dimension
# 3. Set diagonals as 1
# 4. Make code reuseable
loadRelavanceMatrix <- function(aFileName){
rel <- read_csv(aFileName)
entryDim <- max(c(rel$rawSrc, rel$rawDest))

ret <- matrix(0, entryDim, entryDim)
rel %>%
by_row(function(aRow) {ret[aRow[[1]],aRow[[2]] ]<<- aRow[[3]]})
# pmap version
#pmap(list(aRow = relevence_discipline[[1]],
# aCol = relevence_discipline[[2]],
# aRel = relevence_discipline[[3]]),
# function(aRow, aCol, aRel){
# relevant_disciplines_matrix2[aRow, aCol] <<- aRel
# })

diag(ret) <- 1

return(ret)
}

Retrieve correlation

# Original code
relevant_disciplines <- function(disciplines) {
if(length(disciplines)==0) { return(NULL) }
aggregate(
relevance ~ .,
rbind(data.frame(id=disciplines, relevance=1.0),
Reduce(rbind, lapply(disciplines, function(discipline_id) {
nonzero <- relevant_disciplines_matrix[discipline_id,] > 0
data.frame(id=which(nonzero),
relevance=relevant_disciplines_matrix[discipline_id, nonzero])
}))),
max)
}

# Refactored
relevant_disciplines

Friday, June 30, 2017

Extract first component of a string list: dplyr vs. purrr

Goal: Compare two styles of functional programming on parsing string.
Input: The cabin column in Titanic data (Kaggle)
Output: the number extracted from the first components of the space delimited string

Code:
#################################################
library(microbenchmark)
library(tidyverse)
library(stringr)

pc <- microbenchmark(
purrrV = {
trainDf0 <- trainDfRaw
trainDf0$Cabin %>%
as.character() %>%
map(str_split, " ") %>%
flatten() %>%
map_chr(1) %>%
map_chr(str_replace_all, '[[:alpha:]]','') %>%
map(as.integer) %>%
map(coalesce, as.integer(0)) -> trainDf0$Cabin_Number
},
dplyV = {
trainDf1 <- trainDfRaw
trainDf1 %>% mutate(Cabin_Number = as.character(Cabin)) %>%
separate(Cabin_Number, into = c("Cabin_Number"), sep = " ", extra = "drop", remove = TRUE) %>%
mutate(Cabin_Number = coalesce(as.integer(str_replace(Cabin_Number, "[[:alpha:]]", "")), as.integer(0))) ->
trainDf1
}
)

Result
Unit: milliseconds
expr min lq mean median uq max neval
purrrV 1363.32589 1405.10168 1607.33881 1429.10920 1561.6362 3453.50236 100
dplyV 15.69765 16.69651 19.83311 17.46755 19.8518 79.28219 100

dply is much faster than purrr

Sunday, June 11, 2017

Customize Legend in Tableau

Tableau default legend sometimes is too big to fit in the real estate. Thus, we need to use other technique to display a legend. Other use cases include dynamic legend (metric switch or hide when no chart needed), or with extra info (e.g. use bar chart).

Here are some examples:

Name	Solution	Picture	Solution
Bar chart	Use bar chart to provide more info
Rotated	Rotation to save space
Embedded	Put label inside colored shape

Download Tableau workbook (v10.2)

Sunday, January 29, 2017

Add measure to column header

We can add numeric info to the column header to introduce the second measure, or other dimension info as showed in picture below (download workbook).

Steps:
1. Create calculated filed in which we can use STR function to convert number to string, use ATTR to include original dimension break, and use CHAR(10) to insert line break, e.g. ATTR([Segment])+ CHAR(10) + str([Segment Size]).
2. Use two dimension as column header. The first is the customized string in step 1, and the second is the original dimension.
3. Uncheck the Show Header on the original dimension.

3 methods to draw wave/break bar chart

When we design bar chart with very different scales along columns, wave/break line can help to call out the scale difference, like this Excel bar chart (original post).

Here I have three methods to draw wave/break bar chart (download workbook):