Part 1: How not to lie with choropleth maps

Grab Census Data (review from Lab 2)

  • Using tidycensus, load the median household income and median housing value variables from the 2013-2017 ACS 5-year estimates.
library(tidycensus)
library(tidyverse)
library(viridis)
library(plyr)
library(gtools)
# Register the Census API key for this session (replace the placeholder).
census_api_key("Your Key here")

# ACS variable IDs to request:
#   B19013_001 = median household income
#   B25077_001 = median housing value
Var <- c("B19013_001", "B25077_001")

# County-level estimates from the 2017 ACS 5-year survey, returned with
# geometry attached. See the tidycensus get_acs() documentation for the
# exact effect of shift_geo.
CenDF <- get_acs(
  geography = "county",
  variables = Var,
  survey = "acs5",
  year = 2017,
  geometry = TRUE,
  shift_geo = TRUE
)

Manipulate Data

  • Create new variable for the housing price to income ratio.
# Reshape the long ACS table (one row per county x variable) into one row per
# county, then derive the house-price-to-income ratio.
#
# The dplyr/tidyr verbs are namespace-qualified on purpose: this script
# attaches plyr after the tidyverse, so an unqualified mutate() resolves to
# plyr::mutate() rather than the dplyr verb.
CenDF <-
  CenDF %>%
    # Replace the ACS table IDs with readable names; they become column names.
    dplyr::mutate(variable = dplyr::case_when(
      variable == "B19013_001" ~ "HHIncome",
      variable == "B25077_001" ~ "HouseValue")) %>%
    # The margin of error differs per variable and would stop the rows for a
    # county from collapsing into one when spreading.
    dplyr::select(-moe) %>%
    tidyr::spread(variable, estimate) %>%  # one column per variable
    # Ratio = median housing value / median household income, 2 decimals.
    dplyr::mutate(HHInc_HousePrice_Ratio = round(HouseValue / HHIncome, 2))

Map Data

  • Rely on ggplot and scale_fill_viridis to choose the color sequencing automatically (a continuous gradient) and to map the House-Price-to-Income Ratio
# Continuous choropleth: the ratio is mapped straight onto a reversed
# viridis gradient, with no county borders and no graticule.
Plot1 <- ggplot(data = CenDF) +
  geom_sf(mapping = aes(fill = HHInc_HousePrice_Ratio), color = NA) +
  coord_sf(datum = NA) +
  scale_fill_viridis(direction = -1) +
  labs(
    title = "House-Price-to-Income Ratio \n R Built-in Color Cut",
    caption = "Source: ACS 5-year, 2013-2017",
    fill = "Price-Income Ratio"
  )
Plot1

Color Scales in R

R has a variety of functions that make it easy to create color scales. These functions generally require you to specify a color at each end of the spectrum, and they interpolate the colors in between based on how many levels you want.

Importance

We select colors to communicate information about our data. If we are using a continuous variable, the most basic decision is whether we want to represent it as positive and negative deviations from the average (a divergent scale), or as a continuum of low to high values (a sequential scale). If we have categorical data, it is generally visualized through different colors representing each group (a qualitative scale).

Color Schemes

# Canvas for three rows of seven swatches; x runs to 12 so the row labels
# fit to the right of the swatches. Axes and box are suppressed.
plot(
  x = 1:7, y = rep(1, 7),
  xlim = c(0, 12), ylim = c(-0.5, 3.5),
  xaxt = "n", yaxt = "n", bty = "n",
  xlab = "", ylab = ""
)

# One entry per swatch row: the y position, the anchor colors handed to
# colorRampPalette(), and the label drawn beside the row.
swatch_specs <- list(
  list(row = 3, ends = c("gray80", "darkred"), label = "Sequential"),
  list(row = 2, ends = c("darkred", "gray80", "steelblue"), label = "Divergent"),
  list(row = 1, ends = c("gray80", "black"), label = "Grayscale")
)

# Interpolate 7 colors between the anchors and draw them as filled squares.
for (spec in swatch_specs) {
  color.function <- colorRampPalette(spec$ends)
  col.ramp <- color.function(7) # number of groups you desire
  points(1:7, rep(spec$row, 7), pch = 15, cex = 8, col = col.ramp)
}

# Row labels, drawn after all swatches.
for (spec in swatch_specs) {
  text(8, spec$row, spec$label, pos = 4)
}

Alternative Map: 2 Color Groups

  • Transform HHInc_HousePrice_Ratio from continuous variable to factor variable with 2 levels
## Split the continuous ratio at its median into a two-level factor
CenDF$fill_factor <- quantcut(CenDF$HHInc_HousePrice_Ratio, q = c(0, 0.5, 1))
## Replace the interval labels produced by quantcut with readable names
CenDF$fill_factor <- mapvalues(
  CenDF$fill_factor,
  from = levels(CenDF$fill_factor),
  to = c("low", "high")
)

## One viridis color per group
col.ramp <- viridis(n = 2)
Plot2 <- ggplot(data = CenDF) +
  geom_sf(mapping = aes(fill = fill_factor), color = NA) +
  coord_sf(datum = NA) +
  labs(
    title = "House-Price-to-Income Ratio",
    caption = "Source: ACS 5-year, 2013-2017",
    fill = "Price-Income Ratio"
  ) +
  scale_fill_manual(name = "Price-Income Ratio", values = col.ramp)
Plot2

Terciles - 3 Color Groups

  • Transform HHInc_HousePrice_Ratio from continuous variable to factor variable with 3 levels
## Cut the continuous ratio into terciles (a three-level factor).
## The probabilities must run all the way to 1: seq(0, 1, by = .33) stops at
## 0.99, which leaves every county above the 99th percentile outside the
## breaks (NA) and makes the groups unequal. length.out = 4 gives the exact
## cut points 0, 1/3, 2/3, 1.
CenDF$fill_factor <- quantcut(CenDF$HHInc_HousePrice_Ratio,
                              q = seq(0, 1, length.out = 4))
## Replace the interval labels produced by quantcut with readable names
CenDF$fill_factor <- mapvalues(CenDF$fill_factor,
                               from = levels(CenDF$fill_factor),
                               to = c("low", "med", "high"))


## Assign 3 colors, one per tercile
col.ramp <- viridis(n = 3)
Plot3 <- ggplot(CenDF) +
  geom_sf(aes(fill = fill_factor), color = NA) +
  coord_sf(datum = NA) +
  labs(title = "House-Price-to-Income Ratio",
       caption = "Source: ACS 5-year, 2013-2017",
       fill = "Price-Income Ratio") +
  # The name given to the scale is the legend title that gets displayed.
  scale_fill_manual(name = "Price-Income Ratio \n (Terciles)", values = col.ramp)
Plot3

Quintiles - 5 Color Groups

  • Transform HHInc_HousePrice_Ratio from continuous variable to factor variable with 5 levels
## Cut the continuous ratio into quintiles (a five-level factor)
CenDF$fill_factor <- quantcut(
  CenDF$HHInc_HousePrice_Ratio,
  q = seq(0, 1, by = 0.2)
)
## Label the quintiles "1" (lowest) through "5" (highest)
CenDF$fill_factor <- mapvalues(
  CenDF$fill_factor,
  from = levels(CenDF$fill_factor),
  to = as.character(1:5)
)

## One viridis color per group
col.ramp <- viridis(n = 5)
Plot4 <- ggplot(data = CenDF) +
  geom_sf(mapping = aes(fill = fill_factor), color = NA) +
  coord_sf(datum = NA) +
  labs(
    title = "House-Price-to-Income Ratio",
    caption = "Source: ACS 5-year, 2013-2017",
    fill = "Price-Income Ratio"
  ) +
  scale_fill_manual(name = "Price-Income Ratio  \n (Quintiles)", values = col.ramp)
Plot4

Unequal Intervals

  • Transform HHInc_HousePrice_Ratio from continuous variable to factor variable with 6 levels
## Cut the continuous ratio at the 10th, 25th, 50th, 75th and 90th
## percentiles, giving six groups of unequal size
CenDF$fill_factor <- quantcut(
  CenDF$HHInc_HousePrice_Ratio,
  q = c(0, 0.1, 0.25, 0.5, 0.75, 0.9, 1)
)
## Relabelling is left disabled, so the legend shows quantcut's own labels:
# CenDF$fill_factor <- mapvalues(CenDF$fill_factor, from = levels(CenDF$fill_factor),
#                                to = c("< .1", ".1-.25", ".25-.5", ".5-.75", ".75-.9", "> .9"))

## One viridis color per group
col.ramp <- viridis(n = 6)
Plot5 <- ggplot(data = CenDF) +
  geom_sf(mapping = aes(fill = fill_factor), color = NA) +
  coord_sf(datum = NA) +
  labs(
    title = "House-Price-to-Income Ratio",
    caption = "Source: ACS 5-year, 2013-2017",
    fill = "Price-Income Ratio"
  ) +
  scale_fill_manual(name = "Price-Income Ratio", values = col.ramp)
Plot5

Compare Maps

library(gridExtra)
# 2 x 2 comparison: continuous scale, terciles, quintiles, unequal intervals
# (the two-group map, Plot2, is not part of this grid).
grid.arrange(grobs = list(Plot1, Plot3, Plot4, Plot5), nrow = 2)

Example of Skew Correction

# Baseline map: untransformed ratio on a reversed continuous viridis scale.
ggplot(data = CenDF) +
  geom_sf(mapping = aes(fill = HHInc_HousePrice_Ratio), color = NA) +
  coord_sf(datum = NA) +
  scale_fill_viridis(direction = -1) +
  labs(
    title = "House-Price-to-Income Ratio",
    caption = "Source: ACS 5-year, 2013-2017",
    fill = "Price-Income Ratio"
  )

Revising the Scale for Skewed Data

Our price ratio is skewed because of the long right tail:

# Distribution of the raw ratio, 50 bins
hist(CenDF$HHInc_HousePrice_Ratio, col = "darkgray", breaks = 50)

As a result, most of our data will be crammed into the bottom half of our scale, making the colors a lot harder to interpret.

We can consider ways to transform the scale before visualization.

Logging values can pull in outliers:

# Distribution of log(ratio + 1), 50 bins
hist(log(CenDF$HHInc_HousePrice_Ratio + 1), col = "darkgray", breaks = 50)

We can also top-code scales to remove the long tail:

# Top-coding: every ratio above 5 is replaced by 5; missing values stay NA.
CenDF$inc.home.ratio <- pmin(CenDF$HHInc_HousePrice_Ratio, 5)

# Distribution of the top-coded ratio, 50 bins
hist(CenDF$inc.home.ratio, col = "darkgray", breaks = 50)

Let’s also select a divergent scale to better differentiate high and low values (a neutral color like gray represents average values).

The pals package has some crisp color scales.

### devtools::install_github("kwstat/pals")   
library( pals )   
# Preview 13 colors drawn from the ocean.balance palette
pal.map(ocean.balance, n = 13, main = "Example Divergent Colorscale")

Let’s see how this changes our interpretation of the data:

## Divergent palette applied to the raw (skewed) ratio
ggplot(data = CenDF) +
  geom_sf(mapping = aes(fill = HHInc_HousePrice_Ratio), color = NA) +
  coord_sf(datum = NA) +
  scale_fill_gradientn(colours = ocean.balance(10), guide = "colourbar") +
  labs(
    title = "House-Price-to-Income Ratio",
    caption = "Source: ACS 5-year, 2013-2017",
    fill = "Price-Income Ratio"
  )

## Top-coded scale: ratios above 6 are replaced by 6 (missing values stay NA)
CenDF$inc.home.ratio <- pmin(CenDF$HHInc_HousePrice_Ratio, 6)

## Same divergent palette, now applied to the top-coded ratio
ggplot(data = CenDF) +
  geom_sf(mapping = aes(fill = inc.home.ratio), color = NA) +
  coord_sf(datum = NA) +
  scale_fill_gradientn(colours = ocean.balance(10), guide = "colourbar") +
  labs(
    title = "House-Price-to-Income Ratio",
    caption = "Source: ACS 5-year, 2013-2017",
    fill = "Price-Income Ratio"
  )

## Logged ratio: log(ratio + 1) stored as its own column
CenDF$log.price.ratio <- log(CenDF$HHInc_HousePrice_Ratio + 1)

## Same divergent palette, now applied to the logged ratio
ggplot(data = CenDF) +
  geom_sf(mapping = aes(fill = log.price.ratio), color = NA) +
  coord_sf(datum = NA) +
  scale_fill_gradientn(colours = ocean.balance(10), guide = "colourbar") +
  labs(
    title = "House-Price-to-Income Ratio",
    caption = "Source: ACS 5-year, 2013-2017",
    fill = "Price-Income Ratio (log)"
  )