My Missing Data Lab
  • EM Algorithm
  • Univariate Lab
  • Source
Distribution 1
Distribution 2
Mixing & EM option



Result

              
3D Visualization

Coming Soon

Shiny 3-file structure

ui.R server.R and global.R

ui.R
library(shiny)
library(shinymaterial)
library(shinydashboard)
library(plotly)
library(DT)

# flow: input -> box group -> tab -> full ui


# global var --------------------------------------------------------------


methodList <- list("Predictive mean matching" = "pmm",
                   "Weighted predictive mean matching" = "midastouch",
                   "Random sample from observed values" = "sample",
                   "Classification and regression trees" = "cart",
                   "Random forest imputations" = "rf",
                   "Unconditional mean imputation" = "mean",
                   "Bayesian linear regression" = "norm",
                   "Linear regression ignoring model error" = "norm.nob",
                   "Linear regression using bootstrap" = "norm.boot",
                   "predicted values" = "norm.predict",
                   "Imputation of quadratic terms" = "quadratic",
                   "Random indicator for nonignorable data" = "ri",
                   "Level-1 normal heteroskedastic" = "2l.norm",
                   "Level-1 normal homoscedastic, lmer" = "2l.lmer",
                   "Level-1 normal homoscedastic, pan" = "2l.pan",
                   "Level-2 class mean" = "2lonly.mean",
                   "Level-2 class normal" = "2lonly.norm",
                   "Level-2 class predictive mean matching" = "2lonly.pmm")



# inputs lab --------------------------------------------------------------


inMissType <- material_radio_button(
  input_id = "missType",
  label = "Missingness Mechanism:", 
  choices = c("MCAR", "MAR", "MNAR1", "MNAR2")
)

inMissPercent <- material_slider(
  input_id = "missPercent", 
  "% missing data",
  min_value = 1, max_value = 99, initial_value = 20
)

inImputeM <- material_dropdown(
  input_id = "imputeMethod",
  label = "Imputation Mechanism:", 
  choices = as.character(methodList)
)

inImputeMulti <- material_number_box(
  input_id = "imputeN",
  label = "# multi-imuptation",
  min_value = 2, max_value = 30, initial_value = 10
)

runImputation <- material_button(
  input_id = "runImp",
  label = "Run Imputation"
)

inShowGroup <- material_switch(
  input_id = "showGroup",
  #label = " group by age",
  initial_value = TRUE
)

inPlotItems <- material_dropdown(
  input_id = "plotItems", 
  label = "Plot Contents:",
  multiple = TRUE,
  choices = c("")
)



# inputs em ---------------------------------------------------------------


inMu1 <- material_row(
  material_column(width = 4, material_number_box("mean1x", "Mean x", initial_value = 3, min_value = -30, max_value = 30)),
  material_column(width = 4, material_number_box("mean1y", "y", initial_value = 3, min_value = -30, max_value = 30)),
  material_column(width = 4, material_number_box("mean1z", "z", initial_value = 3, min_value = -30, max_value = 30))
)


inSigma1 <- material_row(
  material_row(
    material_column(width = 4, numericInput("sigma1.11", "Covariance", value = 1, step = 0.5))
  ),
  material_row(
    material_column(width = 4, numericInput("sigma1.21", "", value = 0, step = 0.5)),
    material_column(width = 4, numericInput("sigma1.22", "", value = 1, step = 0.5))
  ),
  material_row(
    material_column(width = 4, numericInput("sigma1.31", "", value = 0, step = 0.5)),
    material_column(width = 4, numericInput("sigma1.32", "", value = 0, step = 0.5)),
    material_column(width = 4, numericInput("sigma1.33", "", value = 1, step = 0.5))
  )
) 

inMu2 <- material_row(
  material_column(width = 4, material_number_box("mean2x", "Mean x", initial_value = -1, min_value = -30, max_value = 30)),
  material_column(width = 4, material_number_box("mean2y", "y", initial_value = -1, min_value = -30, max_value = 30)),
  material_column(width = 4, material_number_box("mean2z", "z", initial_value = -1, min_value = -30, max_value = 30))
)

inSigma2 <- material_row(
  material_row(
    material_column(width = 4, numericInput("sigma2.11", "Covariance", value = 1, step = 0.5))
  ),
  material_row(
    material_column(width = 4, numericInput("sigma2.21", "", value = 0, step = 0.5)),
    material_column(width = 4, numericInput("sigma2.22", "", value = 1, step = 0.5))
  ),
  material_row(
    material_column(width = 4, numericInput("sigma2.31", "", value = 0, step = 0.5)),
    material_column(width = 4, numericInput("sigma2.32", "", value = 0, step = 0.5)),
    material_column(width = 4, numericInput("sigma2.33", "", value = 1, step = 0.5))
  )
)

inMix <- tagList(
  material_slider(
    input_id = "lambda",
    label = "Lambda - mix probability (100x)",
    initial_value = 60,
    min_value = 1, max_value = 99
  ),
  material_slider(
    input_id = "sizeN",
    label = "Data length (n)",
    initial_value = 100,
    min_value = 50, max_value = 500
  )
)

inEMextra <- material_row(
  tags$br(),
  tags$br(),
  material_column(width = 6, material_number_box(
    input_id = "maxiter",
    label = "Max Iterations",
    initial_value = 500,
    min_value = 100, max_value = 2000
  )),
  material_column(width = 6, material_number_box(
    input_id = "initMag",
    label = "InitValue Error",
    initial_value = 3,
    min_value = 1, max_value = 10
  ))
)

runEMbtn <- material_row(
  material_column(width = 6, material_button("genSim", label= "Generate Data")),
  material_column(width = 6, material_button("runEM", label = "Run EM"))
)


# modules in uni ----------------------------------------------------------

boxConfig <- material_row(
  title = "Configurations: Missingness & Imputation",
  # missingness inputs
  material_column(width = 4, material_card( 
      inMissType, 
      inMissPercent,
      p(tags$i("Data will be generated automatically"))
  )),
  # imputation inputs
  material_column(width = 4, material_card( 
      inImputeM, 
      inImputeMulti, 
      runImputation
  )),
  # graph options
  material_column(width = 4, material_card( 
      inPlotItems, 
      p(tags$i("only available after imputation | V1,...,V9,... stand for each imputation result from the multi-imputation")),
      inShowGroup
  ))
)

boxValues <- material_row(
  valueBoxOutput(width = 2, "valueBoxA"),
  valueBoxOutput(width = 2, "valueBoxB"),
  valueBoxOutput(width = 4, "valueBoxC"),
  valueBoxOutput(width = 4, "valueBoxD")
)

boxImpRes <- material_row(
  material_column(width = 5, material_card(
    title = "Result",
    dataTableOutput("missTable"),
    plotOutput("missPlot")
  )),
  material_column(width = 7, material_card(
    title = "Visualization",
    plotlyOutput("missMain")
  ))
)


# modules in EM -----------------------------------------------------------

boxEMconfig <- material_row(
  title = "Simulation Configuration",
  # distribution 1
  material_column(width = 4, material_card(title = "Distribution 1", inMu1, inSigma1)),
  # distribution 2
  material_column(width = 4, material_card(title = "Distribution 2", inMu2, inSigma2)),
  # options and run
  material_column(width = 4, material_card(title = "Mixing & EM option", inMix, inEMextra), runEMbtn)
)

boxEMRes <- material_row(
  material_column(width = 4, material_card(title = "Result", 
                                           htmlOutput("emInfo"), 
                                           verbatimTextOutput("emPrint"))),
  material_column(width = 8, material_card(title = "3D Visualization",
                                           plotlyOutput("plot3d", height = "500px")))
)


# modules in source -------------------------------------------------------

boxSource0 <- material_card(
  title = "Shiny 3-file structure",
  p("ui.R    server.R    and    global.R")
)

boxSource1 <- material_card(
  title = "ui.R" ,
  tags$pre(includeText("ui.R"))
)

boxSource2 <- material_card(
  title = "server.R",
  tags$pre(includeText("server.R"))
)

boxSource3 <- material_card(
  title = "global.R",
  tags$pre(includeText("global.R"))
)

# MAKE the UI -------------------------------------------------------------


material_page(
  title = "My Missing Data Lab",
  nav_bar_color = "cyan lighten-2",
  material_tabs(
    tabs = c("EM Algorithm" = "em", 
             "Univariate Lab" = "lab",
             "Source" = "source")
  ),
  material_tab_content(
    tab_id = "em",
    boxEMconfig,
    boxEMRes
  ),
  material_tab_content(
    tab_id = "lab",
    tags$h1("Coming Soon")
#    tags$p("I am currently working on a dynamic link to ESS (European Social Survey), and will be able to publish result here")
#     boxConfig, boxValues, boxImpRes
  ),
  material_tab_content(
    tab_id = "source",
    tags$div(
      class = "container",
      boxSource0,
      boxSource1,
      boxSource2,
      boxSource3
    )
  )
)










server.R
library(shinydashboard)
library(shinyWidgets)
library(plotly)
library(purrr)
library(matrixcalc)
library(DT)
library(dplyr)

# global var --------------------------------------------------------------

myColors<- setNames(c('#66c2a5','#fc8d62','#8da0cb','#e78ac3','#a6d854','#ffd92f'), 
                    c('Distribution 1', 'Distribution 2', 'Theoretic mu 1',
                      'Theoretic mu 2', 'Trace of mu 1', 'Trace of mu 2'))
orgDF <- readRDS("sampleData.rds"); genDF <- NULL; mainDF <- NULL;
sim <- NULL; init <- NULL; p3d <- NULL


shinyServer(function(input, output, session) {

# generate missing data ---------------------------------------------------

  
  observeEvent({input$missType; input$missPercent},{
    genList <- createData(orgDF, input$missType, input$missPercent / 100)
    genDF <<- genList$generatedData
    
    output$valueBoxA <- renderValueBox({
      valueBox( nrow(genDF), "total observations",
                icon = icon("bars"), color = "red")
    })
    output$valueBoxB <- renderValueBox({
      valueBox( sum(!is.na(genDF$Total)), "complete cases",
                icon = icon("info-circle"), color = "yellow")
    })
    output$valueBoxC <- renderValueBox({
      valueBox( paste(round(mean(genDF$Total, na.rm = T), digits = 2), 
                      round(mean(orgDF$Total), digits = 2), sep = " / "),
                "mean: CC vs. Truth",
                icon = icon("dashboard"), color = "aqua")
    })    
    output$valueBoxD <- renderValueBox({
      valueBox( paste(round(sd(genDF$Total, na.rm = T), digits = 2), 
                      round(sd(orgDF$Total), digits = 2), sep = " / "),
                "sd: CC vs. Truth",
                icon = icon("calculator"), color = "blue")
    })    
  })


# imputation --------------------------------------------------------------
  
  observeEvent(input$runImputation,{
    impList <- imputointi(genDF, input$imputeMethod, input$imputeN)

    output$missPlot <- renderPlot(impList$impPlot)
  
    resultDT <- datatable(round(tableIMP(impList, orgDF), digits = 3),
                          options = list(dom = "t", rownames = FALSE))
    output$missTable <- renderDataTable(resultDT)
    
    # set a global var for plot
    impDF <- impList$impDATA
    missId <- which(is.na(genDF$Total))
    plotDF <- as.data.frame(matrix(nrow = nrow(orgDF), ncol = ncol(impDF)))
    plotDF[missId,1:ncol(impDF)] <- impDF
    plotDF$age <- genDF$Age
    plotDF$days <- genDF$Days
    plotDF$ObsTotal <- genDF$Total
    plotDF$real <- NA
    plotDF$real[missId] <- orgDF$Total[missId]
    mainDF <<- tidyr::gather(plotDF, "key", "total", -age, -days)
    
    updatePickerInput(session, "plotItems", choices = unique(mainDF$key), selected = c("ObsTotal", "real","V1"))
  })
  
  observeEvent({input$showGroup; input$plotItems},{
    if(!is.null(mainDF)){
       plotData <- mainDF %>% filter(key %in% input$plotItems)
       p <- ggplot(plotData, aes(days, total, color = key)) + geom_point()
       print(head(plotData)); print(input$plotItems)
       if(input$showGroup) p <- p + facet_wrap(~age)
       
       output$missMain <- renderPlotly((p))
    }
  })
  

# EM: generate samples ----------------------------------------------------
  
  output$emInfo <- renderUI(p('Note: The initial value is generated automatically by including an error defined by InitValue Error'))

  observeEvent(input$genSim,{
    sigma1 <- matrix(c(input$sigma1.11, input$sigma1.21, input$sigma1.31,
                       input$sigma1.21, input$sigma1.22, input$sigma1.32,
                       input$sigma1.31, input$sigma1.32, input$sigma1.33),
                       ncol = 3, nrow = 3)
    sigma2 <- matrix(c(input$sigma2.11, input$sigma2.21, input$sigma2.31,
                       input$sigma2.21, input$sigma2.22, input$sigma2.32,
                       input$sigma2.31, input$sigma2.32, input$sigma2.33),
                     ncol = 3, nrow = 3)
    mu1 <- c(input$mean1x, input$mean1y, input$mean1z)
    mu2 <- c(input$mean2x, input$mean2y, input$mean2z)
    
    if(is.positive.semi.definite(sigma1) & is.positive.semi.definite(sigma2)){
      
      sim <<- genData(input$sizeN, mu1, mu2, sigma1, sigma2, input$lambda/100)
      
      init <<- randomInit(mu1, mu2, sigma1, sigma2, input$lambda/100, input$initMag)
      print(input$lambda)
      
      p3d <<- plot_ly(sim$df) %>% 
        add_markers(x = ~X1, y= ~X2, z= ~X3, color = ~isFirst, colors = myColors,
                    marker = list(opacity = 0.6, symbol = "circle-open-dot", size = 3)) %>%
        add_markers(x = mu1[1], y = mu1[2], z = mu1[3], color = "Theoretic mu 1",
                    marker = list(opacity =1, symbol = "diamond", size = 5))%>%
        add_markers(x = mu2[1], y = mu2[2], z = mu2[3], color = "Theoretic mu 2",
                    marker = list( symbol = "diamond", size = 5))

      output$plot3d <- renderPlotly(p3d)
      
      output$emPrint <- renderPrint({
        print("Samples Generated and Plotted")
        print(list(mu1 = mu1, mu2 = mu2, sigma1 = sigma1, sigma2 = sigma2))
      })
      
    } else {
      output$emInfo <- renderUI(tagList(tags$b("ERROR: "), p("please change covariance | must be positively semi-definite")))
      output$emPrint <- renderPrint({
        print("covariance matrix error")
        print("nothing updated")
      })
    }
    
  })

# EM: do the algorithm ----------------------------------------------------

  observeEvent(input$runEM, {
    
    if(!is.null(sim)){
      em <- myEM(init, sim$data, maxiter = input$maxiter)
      
      tr.mu11 <- em$all %>% map(1) %>% map_dbl(1)
      tr.mu12 <- em$all %>% map(1) %>% map_dbl(2)
      tr.mu13 <- em$all %>% map(1) %>% map_dbl(3)
      tr.mu21 <- em$all %>% map(2) %>% map_dbl(1)
      tr.mu22 <- em$all %>% map(2) %>% map_dbl(2)
      tr.mu23 <- em$all %>% map(2) %>% map_dbl(3)
      
      p3d <<- p3d %>% 
        add_markers(x = tr.mu11, y = tr.mu12, z = tr.mu13, color = "Trace of mu 1",
                    marker = list( symbol = "cross", size = 5)) %>%
        add_markers(x = tr.mu21, y = tr.mu22, z = tr.mu23, color = "Trace of mu 2",
                    marker = list( symbol = "cross", size = 5))
      
      output$emInfo <- renderUI(tagList(
        p("EM result after", length(tr.mu11), "iterations", br(),
          "Covergence reached?", !em$reachMax, br(),
          "convergence is set to 1e-12")
      ))
      
      output$emPrint <- renderPrint({
        print(em$result)
        print("=========")
        print("initial values are")
        print(init)
      })
      
      output$plot3d <- renderPlotly(p3d)
    } else {
      output$emInfo <- renderUI(tagList(tags$b("ERROR: "), p("please click Generate Data first")))
      output$emPrint <- renderPrint({
        print("no data available")
      })
    }
    
    
  })
})
global.R
library(MASS)
library(mice)


# impute ------------------------------------------------------------------

createData <- function(data, missingnessType, amount){
  if(missingnessType == "MCAR"){
    a <- 1:length(data$Age)
    generatedData <- data
    #This missingness index can be used to identify those values that go missing
    missingnessIndex <- sample(a,round(length(data$Age)*amount))
    generatedData[missingnessIndex,2] <- NA
    
  }
  if(missingnessType=="MAR"){
    #data <- arrange(data,data$Age)
    data <- data[order(data$Age),]
    data1234 <-data[data$Age==1 |data$Age==2 |data$Age==3 |data$Age==4,]
    data56 <- data[data$Age==5 | data$Age==6,]
    
    #rownames(data56) <- seq(length(data56$Age))
    #rownames(data1234) <- seq(length(data1234$Age))
    a1 <- 1:length(data56$Age)
    a2 <- 1:length(data1234$Age)
    missingnessIndex1234 <- sample(a2,round(length(data1234$Age)*amount))
    if (amount + 0.2 >= 1) amount=0.8
    missingnessIndex56 <- sample(a1,round(length(data56$Age)*(amount+0.2)))
    data56[missingnessIndex56,2] <- NA
    data1234[missingnessIndex1234,2] <- NA
    generatedData <- data.frame(rbind(data1234,data56))
    missingnessIndex <- c(missingnessIndex1234,missingnessIndex56+length(data1234$Age))
  }
  if(missingnessType=="MNAR1"){
    data <- data[order(data$Total),]
    #data <- arrange(data,data$Total)
    
    data0 <-data[data$Total < 120,]
    data120 <- data[data$Total>=120,]
    
    #rownames(data120) <- seq(length(data120$Age))
    #rownames(data0) <- seq(length(data0$Age))
    a1 <- 1:length(data120$Age)
    a2 <- 1:length(data0$Age)
    missingnessIndex0 <- sample(a2,round(length(data0$Age)*amount))
    if (amount + 0.2 >= 1) amount=0.8
    missingnessIndex120 <- sample(a1,round(length(data120$Age)*(amount+0.2)))
    data120[missingnessIndex120,2] <- NA
    data0[missingnessIndex0,2] <- NA
    generatedData <- data.frame(rbind(data0,data120))
    missingnessIndex <- c(missingnessIndex0,missingnessIndex120+length(data0$Age))
  }
  if(missingnessType=="MNAR2") {
    data <- data[order(data$Total),]
    #data <- arrange(data,data$Total)
    
    index <- data$Days>0
    data2 <- data[index,]
    dataLarge<- data2[(data2$Total/data2$Days)>=5,]
    dataSmall1 <-data2[data2$Total/data2$Days < 5,]
    dataSmall2 <- data[index==FALSE,]
    dataSmall <- data.frame(rbind(dataSmall1,dataSmall2))
    
    a1 <- 1:length(dataLarge$Age)
    a2 <- 1:length(dataSmall$Age)
    missingnessIndexSmall <- sample(a2,round(length(dataSmall$Age)*amount))
    if (amount + 0.2 >= 1) amount=0.8
    missingnessIndexLarge <- sample(a1,round(length(dataLarge$Age)*(amount+0.2)))
    dataLarge[missingnessIndexLarge,2] <- NA
    dataSmall[missingnessIndexSmall,2] <- NA
    generatedData <- data.frame(rbind(dataSmall,dataLarge))
    missingnessIndex <- c(missingnessIndexSmall,missingnessIndexLarge+length(dataSmall$Age))
    
  }
  # First is index telling what rows are missing from the original data.
  # Then returns the new data with missingness.
  return(list(missingnessIndex = missingnessIndex, generatedData = generatedData))
}

# Imputation itself. Gets data which has missing values and information
# about what method to use in "mice" function. 
imputointi <- function(generatedData, method, m) {
  imp1 <- mice(generatedData[,1:3],m=m,  meth=method, print=F)
  impPlot <- xyplot(imp1,Total~Days|Age)
  
  #summary(imp1)
  fish1 <- with(imp1,lm(Total~-1+as.factor(Age)))
  fish1_pooled <- summary(pool(fish1))
  #print(fish1_pooled)
  impPARAM<-(fish1_pooled[1:6,1:2])
  impDATA <- imp1$imp$Total
  # Returns first table of estimated values and mean errors, then
  # the imputated values (20 columns since we have 20 imputations)
  # and lastly a plot about imputations. This plot may or may not be
  # used.
  return(list(impPARAM = impPARAM, impDATA = impDATA, impPlot = impPlot))
}


# Returns a table with imputated data mean, mean error and real data mean
tableIMP <- function(impu, aineisto) {
  impTable <- (impu[[1]])
  aineisto <- as.data.frame(aineisto)
  realMean <- NULL
  for (i in 1:6) {
    realMean[i] <-  mean(aineisto[aineisto$Age==i,2])
  }
  impTable <- data.frame(impTable,realMean)
  rownames(impTable) <- paste("Age Group", 1:6)
  impTable
}


# EM ----------------------------------------------------------------------

genData <- function(n, mu1, mu2, sigma1, sigma2, lambda) {
  xa <- mvrnorm(n, mu = mu1, Sigma = sigma1)
  xb <- mvrnorm(n, mu = mu2, Sigma = sigma2)
  p <- rbinom(n, 1, lambda)
  x <- xa
  x[which(p==0),] <- xb [which(p==0),]
  
  # construct data.frame
  isFirst <- rep("Distribution 1",n)
  isFirst[which(p==0)] <- "Distribution 2"
  df <- data.frame(x, isFirst)
  
  return(list(data = x, id = p, df = df))
}

myEM <- function(init, x, maxiter = 1000, eps = 1e-12) {
  library(mvtnorm)
  theta <- list()
  theta[[1]] <- init
  
  for (i in 2:maxiter) {
    theta.p<- theta[[i-1]] # previous theta
    R1 <- numeric(nrow(x))
    R2 <- numeric(nrow(x))
    # use log for accuracy
    for (j in 1:nrow(x)) {
      lf1 <-  dmvnorm((x[j,]),mean = theta.p$mu1, sigma = theta.p$sigma1)
      lf2 <-  dmvnorm((x[j,]),mean = theta.p$mu2, sigma = theta.p$sigma2)
      Ra <- theta.p$lambda * lf1
      Rb <- (1 - theta.p$lambda) * lf2
      R1[j] <- Ra / (Ra + Rb)
      R2[j] <- 1- R1[j]
    }
    
    lambda <- mean(R1)
    # point-wise multiplication
    mu1 <- colSums(R1 * x) / sum(R1)
    mu2 <- colSums(R2 * x) / sum(R2)
    # calculate Rt(xi - mu1)(xi - mu1) !new mu
    rxx1 <- R1[j] * (x[j,] - mu1) %*% t(x[j,] - mu1)
    rxx2 <- R2[j] * (x[j,] - mu2) %*% t(x[j,] - mu2)
    for (j in 2: nrow(x)) {
      rxx1 <- rxx1 + R1[j] * (x[j,] - mu1) %*% t(x[j,] - mu1)
      rxx2 <- rxx2 + R2[j] * (x[j,] - mu2) %*% t(x[j,] - mu2)
    }
    
    theta[[i]] <- list(mu1 = mu1,
                       mu2 = mu2,
                       sigma1 = rxx1 / sum(R1),
                       sigma2 = rxx2 / sum(R2),
                       lambda = lambda)
    # return if close enough
    eDist <- dist(rbind(mu1, theta.p$mu1)) + dist(rbind(mu2, theta.p$mu2))
    if( eDist < 2 * eps)
      return(list(all = theta, result = theta[[i]], reachMax = F))
  }
  return(list(all = theta, result = theta[[maxiter]], reachMax = TRUE))
}

randomInit <- function(mu1, mu2, sigma1, sigma2, lambda, magnitude) {
  p <- length(mu1)
  diag10 <- magnitude * diag(nrow = p)
  mu1.x <- mu1 + mvrnorm(1, rep(0,p), diag10)
  mu2.x <- mu1 + mvrnorm(1, rep(0,p), diag10)
  lambda.x <- runif(1)
  
  return(list(mu1 = mu1.x, 
              mu2 = mu2.x,
              sigma1 = diag10, 
              sigma2 = diag10,
              lambda = lambda.x))
}