MIS 315: Data Analytics I

Missing Data

I. Ozkan

Fall 2025

Missing Values

This presentation is based on Lecture 5 of ETC5510: Introduction to Data Analysis https://mida-monash.netlify.app/slides/lecture_5a.pdf

Missing Values

# create a numeric vector x in which some values are missing (NA)
x <- c(6, NA, 2, NA, 7, NA, 5)

# which elements of x are NA? is.na() returns one TRUE/FALSE per element
is.na(x)
## [1] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE

# how many NAs are there?
# first, check how sum() behaves
sum(1,2,3) # same as sum(c(1,2,3))
## [1] 6

sum(TRUE, FALSE, TRUE, TRUE)
## [1] 3

# each TRUE is counted as 1 (and each FALSE as 0), so summing a logical
# vector counts its TRUE values
# is.na(x): [1] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
sum(is.na(x)) # x has 3 NAs
## [1] 3

# proportion of missing values =
# number of missing values / total number of values
length(x) # number of elements (valid for a vector)
## [1] 7

# proportion missing: 3 / 7
sum(is.na(x))/length(x)
## [1] 0.4285714

Working with Missing Values


# ordinary arithmetic
3+4
## [1] 7

# any arithmetic involving NA gives NA
3 + NA
## [1] NA

# recall x, the vector with three NAs defined on the earlier slide
x 
## [1]  6 NA  2 NA  7 NA  5

# element-wise addition: the NA positions stay NA
x+3 
## [1]  9 NA  5 NA 10 NA  8

# a single NA makes the whole sum NA
sum(x)
## [1] NA

# but look at the help page for sum():
# ?sum 
# Usage
# sum(..., na.rm = FALSE)

# be careful with the two calls below: both silently drop the NAs
# na.rm = TRUE tells sum() to ignore missing values
sum(x, na.rm = TRUE)
## [1] 20

# alternatively, remove the NAs first and then add the remaining numbers
sum(na.omit(x))
## [1] 20

Be Careful when Removing NAs

x y z
NA 1 1
NA 2 2
NA 3 3
4 NA 4
5 NA 5
6 NA 6
7 7 NA
8 8 NA
9 9 NA

Be Careful when Removing NAs

# Rebuild the 9-row example table shown on the previous slide so that this
# snippet runs on its own: every row has exactly one NA (in x, y or z).
dat_df <- data.frame(
  x = c(NA, NA, NA, 4, 5, 6, 7, 8, 9),
  y = c(1, 2, 3, NA, NA, NA, 7, 8, 9),
  z = c(1, 2, 3, 4, 5, 6, NA, NA, NA)
)

# vis_miss() function is available in the visdat package
# install.packages("visdat")
# library(visdat)
# plot which cells of dat_df are missing
vis_miss(dat_df)

Be Careful when Removing NAs

# na.omit() drops every row that contains at least one NA.
# Assuming dat_df is the 9-row x/y/z table shown earlier, each row has an NA
# in one of its columns, so no rows survive.
na.omit(dat_df)
## [1] x y z
## <0 rows> (or 0-length row.names)

na.omit / na.rm

x y z
NA 1 1
NA 2 2
NA 3 3
4 NA 4
5 NA 5
6 NA 6
7 7 NA
8 8 NA
9 9 NA

na.omit / na.rm

x y z
NA 1 1
NA 2 2
NA 3 3
4 NA 4
5 NA 5
6 NA 6
7 7 NA
8 8 NA
9 9 NA

na.omit / na.rm

x y z
NA 1 1
NA 2 2
NA 3 3
4 NA 4
5 NA 5
6 NA 6
7 7 NA
8 8 NA
9 9 NA

na.omit / na.rm

x y z
NA 1 1
NA 2 2
NA 3 3
4 NA 4
5 NA 5
6 NA 6
7 7 NA
8 8 NA
9 9 NA

na.omit / na.rm

Removing NAs May Introduce Bias

temp location
27 inside
26 inside
NA outside
29 inside
NA outside
20 outside
21 outside
24 inside

Missingness Summaries

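Basic summaries of missingness: `n_miss()`, `n_complete()`

Dataframe summaries of missingness: `miss_var_summary()`, `miss_case_summary()`, `miss_case_table()`

Note: The dataframe summaries also work on grouped data (after `group_by()`).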

Missingness Summaries

# airquality data (from the datasets package)
# check with ?airquality
# Daily air quality measurements in New York, May to September 1973
# A data frame with 153 observations on 6 variables

# total number of missing cells (NAs) in the whole data frame
sum(is.na(airquality)) 
## [1] 44

n_miss(airquality) # same as above 
## [1] 44

# total number of non-missing cells
# (44 missing + 874 present = 918 = 153 rows x 6 columns)
sum(!is.na(airquality))
## [1] 874

n_complete(airquality) # same as above 
## [1] 874

Missingness Summaries


# missingness summary per column (variable):
# number of NAs (n_miss) and percentage of NAs (pct_miss) in each column
miss_var_summary(airquality)
## # A tibble: 6 × 3
##   variable n_miss pct_miss
##   <chr>     <int>    <num>
## 1 Ozone        37    24.2 
## 2 Solar.R       7     4.58
## 3 Wind          0     0   
## 4 Temp          0     0   
## 5 Month         0     0   
## 6 Day           0     0

Missingness Summaries


# missingness summary per case (row):
# number of NAs (n_miss) and percentage of the row's values that are NA
# (pct_miss); in the output the rows with the most NAs are listed first
miss_case_summary(airquality)   
## # A tibble: 153 × 3
##     case n_miss pct_miss
##    <int>  <int>    <dbl>
##  1     5      2     33.3
##  2    27      2     33.3
##  3     6      1     16.7
##  4    10      1     16.7
##  5    11      1     16.7
##  6    25      1     16.7
##  7    26      1     16.7
##  8    32      1     16.7
##  9    33      1     16.7
## 10    34      1     16.7
## # ℹ 143 more rows

Missingness Summaries


# tabulate the cases (rows) by how many values they are missing:
# 111 rows are complete, 40 rows have one NA and 2 rows have two NAs
miss_case_table(airquality)   
## # A tibble: 3 × 3
##   n_miss_in_case n_cases pct_cases
##            <int>   <int>     <dbl>
## 1              0     111     72.5 
## 2              1      40     26.1 
## 3              2       2      1.31

Missingness Summaries

# group_by() example: the same case table, computed separately for each Month
# (the result stays grouped by Month, as the "# Groups:" line in the output shows)
airquality %>%
  group_by(Month) %>%
  miss_case_table()
## # A tibble: 11 × 4
## # Groups:   Month [5]
##    Month n_miss_in_case n_cases pct_cases
##    <int>          <int>   <int>     <dbl>
##  1     5              0      24     77.4 
##  2     5              1       5     16.1 
##  3     5              2       2      6.45
##  4     6              0       9     30   
##  5     6              1      21     70   
##  6     7              0      26     83.9 
##  7     7              1       5     16.1 
##  8     8              0      23     74.2 
##  9     8              1       8     25.8 
## 10     9              0      29     96.7 
## 11     9              1       1      3.33

Missingness Visualization

# plot which cells of airquality are missing
vis_miss(airquality)

Missingness Visualization: Arrange rows by missingness

# cluster = TRUE arranges the rows by their missingness pattern.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
vis_miss(airquality, cluster = TRUE)

Missingness Visualization: For each month

# facet = Month draws a separate missingness panel for each month
vis_miss(airquality, facet = Month)

Missingness Visualization: For Each Column

# plot the amount of missing data in each variable (column)
gg_miss_var(airquality)

Missingness Visualization: For Each Column, by Month

# the per-variable missingness plot, split into one panel per Month
gg_miss_var(airquality, facet = Month)

Missingness Visualization: Patterns

# upset plot of the missingness patterns
# (presumably: which variables are missing together, and how often -- confirm
# against the gg_miss_upset() help page)
gg_miss_upset(airquality)

How to Work with Missing Values

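Common ways to impute values: median, mean, linear model, decision tree, and k-nearest neighbours (KNN). An example of each follows.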

Impute Example: Median

# Median imputation of Ozone, then a scatter plot of Ozone against Solar.R.
# NOTE(review): the formula Ozone ~ Month presumably means "impute Ozone with
# the median within each Month" (simputation's target ~ grouping form) -- confirm.
airquality %>%  
  as.data.frame() %>% 
  # record which rows had a missing Ozone *before* imputing,
  # so the imputed points can be coloured differently in the plot
  mutate(Ozone_NA = is.na(Ozone)) %>% 
  simputation::impute_median(Ozone ~ Month) %>% #<<
  ggplot(aes(x = Solar.R,
            y = Ozone,
            colour = Ozone_NA)) + 
  geom_point() + 
  theme_minimal()

Impute Example: Mean

# Mean imputation of Ozone, then a scatter plot of Ozone against Solar.R.
# The right-hand side of the formula is the value to impute (the mean of the
# observed Ozone values); the part after `|` is the grouping variable (Month).
airquality %>%
  as.data.frame() %>%
  # record which rows had a missing Ozone *before* imputing,
  # so the imputed points can be coloured differently in the plot
  mutate(Ozone_NA = is.na(Ozone)) %>%
  # TRUE rather than T: T is an ordinary variable and can be overwritten
  simputation::impute_proxy(Ozone ~ mean(Ozone, na.rm = TRUE) | Month) %>%
  ggplot(aes(x = Solar.R,
             y = Ozone,
             colour = Ozone_NA)) +
  geom_point() +
  theme_minimal()

Impute Example: Linear Model (will be discussed)

# Linear-model imputation: missing Ozone values are filled in using
# Wind, Temp and Solar.R as predictors; then Ozone is plotted against Solar.R.
airquality %>%  
  as.data.frame() %>% 
  # record which rows had a missing Ozone *before* imputing,
  # so the imputed points can be coloured differently in the plot
  mutate(Ozone_NA = is.na(Ozone)) %>% 
  simputation::impute_lm(Ozone ~ Wind + Temp + Solar.R) %>% #<<
  ggplot(aes(x = Solar.R,
            y = Ozone,
            colour = Ozone_NA)) + 
  geom_point() + 
  theme_minimal()

Impute Example: Decision Tree (will be discussed)

# Decision-tree imputation: missing Ozone values are filled in using
# Wind, Temp and Solar.R as predictors; then Ozone is plotted against Solar.R.
airquality %>%  
  as.data.frame() %>% 
  # record which rows had a missing Ozone *before* imputing,
  # so the imputed points can be coloured differently in the plot
  mutate(Ozone_NA = is.na(Ozone)) %>% 
  simputation::impute_cart(Ozone ~ Wind + Temp + Solar.R) %>% #<<
  ggplot(aes(x = Solar.R,
            y = Ozone,
            colour = Ozone_NA)) + 
  geom_point() + 
  theme_minimal()

Impute Example: KNN (will be discussed)

# KNN (k-nearest-neighbours) imputation: missing Ozone values are filled in
# using Wind, Temp and Solar.R; then Ozone is plotted against Solar.R.
airquality %>%  
  as.data.frame() %>% 
  # record which rows had a missing Ozone *before* imputing,
  # so the imputed points can be coloured differently in the plot
  mutate(Ozone_NA = is.na(Ozone)) %>% 
  simputation::impute_knn(Ozone ~ Wind + Temp + Solar.R) %>% #<<
  ggplot(aes(x = Solar.R,
            y = Ozone,
            colour = Ozone_NA)) + 
  geom_point() + 
  theme_minimal()