Descriptive Analysis of Community Change

April 19, 2021

Introduction

This tutorial walks you through some common preprocessing steps you will need for this week’s lab.

Please use the import::here function

This tutorial does not use import::here(), but your Week 03 submission requires it. The code is kept here to help you understand why certain decisions were made.

If you need to see how to use the import::here() function, please see the “Project Data Steps” tutorial.
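
For reference, the general pattern looks something like the sketch below (this is a sketch only; the helper names and file location are placeholders for whatever you define in your own utilities file):

# import only the helpers you need from your utilities file
# (function names below are placeholders for your own helpers)
import::here( "compare_dfs", "jplot",
              .from = here::here( "labs/wk03/utilities.R" ),
              .character_only = TRUE )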

library( dplyr )
library( here )
library( knitr )
library( pander )
library( stargazer )
library( scales )

# set stargazer type to text for 
# previewing in RMD docs but
# convert to type HTML when knitting
# (next code chunk)

s.type <- "text"  
###################################
#
#     STARGAZER SETTINGS
#
###################################

# DO NOT RUN CHUNK UNLESS KNITTING:
# changes table formats to html
# before rendering RMD docs

s.type <- "html"

For this part of the project you will:

  • Calculate change in the MHV variable between 2000 and 2010.
  • Measure gentrification that occurs between 2000 and 2010.
  • Pick a city and create a new Dorling cartogram to visualize your data.
  • Prepare descriptive statistics and choropleth maps describing the MHV change variable and gentrification.

This tutorial demonstrates some of the initial steps of variable creation and data exploration for the project.

Data

See the data steps tutorial for the wrangling used to create our rodeo datasets.

Reminder: stop using static file paths and start using the here::here() function

I cannot stress this enough: all data files must be imported using relative paths. You cannot use a static file path (e.g., /Users/myname/some/file/path) because it breaks portability and therefore makes it impossible for anyone to reproduce your analysis.

The reason is that no one else can rerun your file: that static file path belongs to you and only you. For a refresher on why you cannot use static file paths and need to use the here::here() function, please see Week 02’s lecture videos.
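
As a quick illustration, compare the two approaches (the absolute path below is only an example of what not to do):

# fragile: an absolute path that only exists on one person's machine
# d <- readRDS( "/Users/myname/project/data/rodeo/LTDB-2000.rds" )

# portable: here::here() resolves the path relative to the project root
d <- readRDS( here::here( "data/rodeo/LTDB-2000.rds" ) )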

Before moving on, be sure to run the Data Steps tutorial

The following chunks depend on you having clean data in your data/rodeo folder. These files are generated by following along with the Data Steps tutorial.

To re-run the tutorial locally, follow these steps:

  1. Download a local copy of the labs/utilities.R file.
  2. Save the file as labs/wk03/utilities.R.
  3. Download a local copy of the labs/project_data_steps.R file.
  4. Save the file as labs/wk03/project_data_steps.R.
  5. Select all of the lines within labs/wk03/project_data_steps.R and run them to produce the clean data files within data/rodeo.
# note: please do not use static file paths
# note: notice the use of here::here() below
d1 <- readRDS( here::here( "data/rodeo/LTDB-2000.rds" ) )
d2 <- readRDS( here::here( "data/rodeo/LTDB-2010.rds" ) )
md <- readRDS( here::here( "data/rodeo/LTDB-META-DATA.rds" ) )

# check to make sure we are not losing 
# or gaining observations in the merge
nrow( d1 ) 
## [1] 72693
d1 <- select( d1, - year )
d2 <- select( d2, - year )

d <- merge( d1, d2, by="tractid" )
d <- merge( d, md, by="tractid" )

nrow( d )
## [1] 72693

Filter Rural Districts

table( d$urban )
## 
## rural urban 
## 12971 59722
d <- filter( d, urban == "urban" )

Identify Common Variables

We can create a function to compare variables from the 2000 and 2010 datasets:

# find variables that are in both files
compare_dfs <- function( df1, df2 )
{
  # use regular expressions to remove numeric suffixes 
  var.names.1 <- names( df1 )
  var.names.1 <- gsub( "[.][xy]$", "", var.names.1 )
  var.names.1 <- gsub( "[0-9]{2}$", "", var.names.1 )
  
  var.names.2 <- names( df2 )
  var.names.2 <- gsub( "[.][xy]$", "", var.names.2 )
  var.names.2 <- gsub( "[0-9]{2}$", "", var.names.2 )
  
  shared <- intersect( var.names.1, var.names.2 ) %>% sort()
  print( "SHARED VARIABLES:")
  print( shared )
  
  not.shared <- c( setdiff( var.names.1, var.names.2 ),
                   setdiff( var.names.2, var.names.1 ) ) %>% sort()
  
  print( "NOT SHARED:" )
  print( not.shared )
  
  d.vars1 <- data.frame( type="shared", variables=shared, stringsAsFactors=F )
  d.vars2 <- data.frame( type="not shared", variables=not.shared, stringsAsFactors=F )
  dd <- rbind( d.vars1, d.vars2 )
  
  return( dd )
}

vars <- compare_dfs( df1=d1, df2=d2 )
## [1] "SHARED VARIABLES:"
##   [1] "a15asn"  "a15blk"  "a15hsp"  "a15ntv"  "a15wht"  "a18und"  "a60asn" 
##   [8] "a60blk"  "a60hsp"  "a60ntv"  "a60up"   "a60wht"  "a75up"   "ag15up" 
##  [15] "ag18cv"  "ag25up"  "ag5up"   "ageasn"  "ageblk"  "agehsp"  "agentv" 
##  [22] "agewht"  "asian"   "china"   "clf"     "col"     "cuban"   "dapov"  
##  [29] "dbpov"   "dflabf"  "dfmpov"  "dhpov"   "dmulti"  "dnapov"  "dpov"   
##  [36] "dwpov"   "empclf"  "family"  "fb"      "fhh"     "filip"   "flabf"  
##  [43] "geanc"   "gefb"    "h10yrs"  "h30old"  "haw"     "hh"      "hha"    
##  [50] "hhb"     "hhh"     "hhw"     "hinc"    "hinca"   "hincb"   "hinch"  
##  [57] "hincw"   "hisp"    "hs"      "hu"      "incpc"   "india"   "iranc"  
##  [64] "irfb"    "itanc"   "itfb"    "japan"   "korea"   "lep"     "manuf"  
##  [71] "mar"     "mex"     "mhmval"  "mrent"   "multi"   "n10imm"  "n65pov" 
##  [78] "napov"   "nat"     "nbpov"   "nfmpov"  "nhblk"   "nhpov"   "nhwht"  
##  [85] "nnapov"  "npov"    "ntv"     "nwpov"   "ohu"     "olang"   "own"    
##  [92] "pop"     "pr"      "prof"    "rent"    "ruanc"   "rufb"    "scanc"  
##  [99] "scfb"    "semp"    "tractid" "unemp"   "vac"     "vet"     "viet"   
## [106] "wds"    
## [1] "NOT SHARED:"
##  [1] "a65asn"  "a65blk"  "a65hsp"  "a65ntv"  "a65wht"  "cni16u"  "dis"    
##  [8] "hu00sp"  "ohu00sp" "p10imm"  "p10yrs"  "p15asn"  "p15blk"  "p15hsp" 
## [15] "p15ntv"  "p15wht"  "p18und"  "p30old"  "p60up"   "p65asn"  "p65blk" 
## [22] "p65hsp"  "p65ntv"  "p65pov"  "p65wht"  "p75up"   "papov"   "pasian" 
## [29] "pbpov"   "pchina"  "pcol"    "pcuban"  "pfb"     "pfhh"    "pfilip" 
## [36] "pflabf"  "pfmpov"  "pgeanc"  "pgefb"   "phaw"    "phisp"   "phpov"  
## [43] "phs"     "pindia"  "piranc"  "pirfb"   "pitanc"  "pitfb"   "pjapan" 
## [50] "pkorea"  "plep"    "pmanuf"  "pmar"    "pmex"    "pmulti"  "pnapov" 
## [57] "pnat"    "pnhblk"  "pnhwht"  "pntv"    "polang"  "pown"    "ppov"   
## [64] "ppr"     "pprof"   "pruanc"  "prufb"   "pscanc"  "pscfb"   "psemp"  
## [71] "punemp"  "pvac"    "pvet"    "pviet"   "pwds"    "pwpov"
head( vars )

Create Dataset for Analysis

Create subset for the analysis

d.full <- d  # keep a copy so we don't have to reload 
d <- d.full  # restore original in case you need to reset anything

d <- select( d, tractid, mhmval00, mhmval12, hinc00, 
             hu00, own00, rent00,  
             empclf00, clf00, unemp00, prof00,  
             dpov00, npov00,
             ag25up00, hs00, col00, 
             pop00.x, nhwht00, nhblk00, hisp00, asian00,
             cbsa, cbsaname )
d <- 
  d %>%
  mutate( p.white = 100 * nhwht00 / pop00.x,
          p.black = 100 * nhblk00 / pop00.x,
          p.hisp = 100 * hisp00 / pop00.x, 
          p.asian = 100 * asian00 / pop00.x,
          p.hs = 100 * (hs00+col00) / ag25up00,
          p.col = 100 * col00 / ag25up00,
          p.prof = 100 * prof00 / empclf00,
          p.unemp = 100 * unemp00 / clf00,
          pov.rate = 100 * npov00 / dpov00 )

If you call the stargazer() function with a linear model object (a regression model) it will create a nicely-formatted regression table for you.

If you instead use a data frame object it will create a table of descriptive statistics.

You can set the statistics you would like reported in the table with the summary.stat= argument.
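
For example, here is a minimal sketch of the regression-table usage with two variables that are already in d (the model itself is illustrative and not part of this analysis):

# toy regression: 2000 home values explained by 2000 household income
m <- lm( mhmval00 ~ hinc00, data=d )
stargazer( m, type=s.type, digits=0 )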

stargazer( d, 
           type=s.type, 
           digits=0,
           summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic Min Pctl(25) Median Mean Pctl(75) Max
mhmval00 0 81,600 119,900 144,738 173,894 1,000,001
mhmval12 9,999 123,200 193,200 246,570 312,000 1,000,001
hinc00 2,499 33,000 43,825 47,657 58,036 200,001
hu00 0 1,102 1,519 1,570 1,999 11,522
own00 0 542 902 939 1,289 4,911
rent00 0 195 398 516 712 8,544
empclf00 0 1,205 1,756 1,820 2,373 10,334
clf00 0 1,302 1,865 1,930 2,502 11,251
unemp00 0 51 87 110 140 6,405
prof00 0 299 539 637 873 6,610
dpov00 0 2,671 3,718 3,804 4,871 23,892
npov00 0 149 304 452 601 5,515
ag25up00 0 1,763 2,451 2,520 3,224 17,974
hs00 0 665 1,071 1,155 1,552 8,909
col00 0 243 492 665 923 9,313
pop00.x 0 2,751 3,802 3,901 4,976 36,206
nhwht00 0 1,308 2,514 2,591 3,713 20,619
nhblk00 0 41 141 522 527 14,039
hisp00 0 55 153 547 533 13,391
asian00 0 22 65 189 183 9,491
p.white 0 47 78 67 91 100
p.black 0 1 4 14 14 100
p.hisp 0 2 4 13 15 100
p.asian 0 1 2 5 5 95
p.hs 0 67 72 72 77 100
p.col 0 12 21 26 36 100
p.prof 0 23 31 34 43 100
p.unemp 0 3 5 6 8 100
pov.rate 0 4 9 12 17 100

Exploration of Median Home Value

Initial conditions in 2000:

# adjust 2000 home values for inflation 
mhv.00 <- d$mhmval00 * 1.28855  
mhv.10 <- d$mhmval12

mhv.change <- mhv.10 - mhv.00

df <- data.frame( MedianHomeValue2000=mhv.00, 
                  MedianHomeValue2010=mhv.10, 
                  Change.00.to.10=mhv.change )

stargazer( df, 
           type=s.type, 
           digits=0, 
           summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic Min Pctl(25) Median Mean Pctl(75) Max
MedianHomeValue2000 0 105,146 154,497 186,502 224,071 1,288,551
MedianHomeValue2010 9,999 123,200 193,200 246,570 312,000 1,000,001
Change.00.to.10 -1,228,651 7,187 36,268 60,047 94,881 1,000,001

A note on inflation calculations. You can google “inflation calculator” and you will find a bunch of sites that give you the conversion. Just select 2010 as the final year, your starting year as the input year, and use $1 as the starting value. For example:

https://westegg.com/inflation/

You will get slightly different results from site to site.

Otherwise, you can always estimate it if you know the average long-term inflation rate. Since 2000 the rate has averaged about 2.5% in the US, so you can use:

# 10 year inflation factor
(1.025)^10
## [1] 1.280085

Prior to 2000 the rates were higher and fluctuated quite a bit through the ’70s and ’80s, so if you are going further back in time it is better to use a calculator. The calculators should use the actual rates for each year.

Histogram of MHV

hist( mhv.change/1000, breaks=500, 
      xlim=c(-100,500), yaxt="n", xaxt="n",
      xlab="Thousand of US Dollars (adjusted to 2010)", cex.lab=1.5,
      ylab="", main="Change in Median Home Value 2000 to 2010",
      col="gray20", border="white" )

axis( side=1, at=seq( from=-100, to=500, by=100 ), 
      labels=paste0( "$", seq( from=-100, to=500, by=100 ), "k" ) )
        
mean.x <- mean( mhv.change/1000, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=200, y=1500, 
      labels=paste0( "Mean = ", dollar( round(1000*mean.x,0)) ), 
      col="darkorange", cex=1.8, pos=3 )

median.x <- median( mhv.change/1000, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=200, y=2000, 
      labels=paste0( "Median = ", dollar( round(1000*median.x,0)) ), 
      col="dodgerblue", cex=1.8, pos=3 )

# function to control plot() formatting 
jplot <- function( x1, x2, lab1="", lab2="", draw.line=T, ... )
{

    plot( x1, x2,
          pch=19, 
          col=gray(0.6, alpha = 0.2), 
          cex=2.5,  
          bty = "n",
          xlab=lab1, 
          ylab=lab2, cex.lab=1.5,
        ... )

    if( draw.line==T ){ 
        ok <- is.finite(x1) & is.finite(x2)
        lines( lowess(x2[ok]~x1[ok]), col="red", lwd=3 ) }

}

Compare 2000 to 2010 distributions.

layout.matrix <- matrix( c( 1,3,
                            2,3 ), 
                nrow=2, ncol=2, byrow=T )

layout( mat = layout.matrix,
        heights = c(2,2), # Heights of the two rows
        widths =  c(3,4)) # Widths of the two columns

# layout.show(3)

par( mar=c(4,0,0,2) )

hist( mhv.00/1000, breaks=50, 
      xlim=c(-200,800), yaxt="n", xaxt="n",
      xlab="", cex.lab=1,
      ylab="", main="",
      col="darkslateblue", border="white" )

axis( side=1, at=seq( from=0, to=1000, by=100 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )

abline( v=seq(0,1000,100), lty=2, col="gray80" )

text( 550, 4000, labels="Median Home \nValue in 2000", 
      col="darkslateblue", cex=1.8 )



hist( mhv.10/1000, breaks=50, 
      xlim=c(-200,800), yaxt="n", xaxt="n",
      xlab="", cex.lab=1,
      ylab="", main="",
      col="darkslateblue", border="white" )

abline( v=seq(0,1000, 100 ), lty=2, col="gray80" )

text( 550, 3500, labels="Median Home \nValue in 2010", 
      col="darkslateblue", cex=1.8 )

axis( side=1, at=seq( from=0, to=1000, by=100 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )


# data reduction - sample 1,000 observations for the scatterplot

df <- data.frame( v00=mhv.00/1000, v10=mhv.10/1000 )
df <- sample_n( df, 1000 )

par( mar=c(4,5,3,2) )

jplot( df$v00, df$v10, 
       lab1="MHV in 2000", lab2="MHV in 2010",
       xlim=c(0,1000), ylim=c(0,1000),
       axes=F )

abline( a=0, b=1, lty=2, col="gray" )
axis( side=1, at=seq( from=0, to=1000, by=200 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )
axis( side=2, at=seq( from=0, to=1000, by=200 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )

Change in MHV 2000-2010

If a home worth $10 million increased in value by $100k over ten years, it would not be that surprising. If a home worth $50k increased by $100k over the same period, that is growth of 200% and is notable.

The change-in-value variable only reports absolute change; it does not provide a sense of whether that is a big or small amount for the census tract.

The percent change variable provides some context for the growth rate of values in each census tract.
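
A quick worked example of that logic (hypothetical homes, not tracts from the data):

# same absolute change, very different relative change
change <- 100000
100 * change / 10000000   # $10 million home:   1% growth
100 * change / 50000      # $50 thousand home: 200% growth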

# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?

# interpretation is hard if there were no homes in 2000
# and thus an artificially low MHV. I don't trust cases
# that go from homes worth $10k to regular value
# because it is more likely errors in data or noise
# than meaningful variance 
#
# quick filter to remove all of the problematic obs
# but need to go back and see which cases are problematic


mhv.00[ mhv.00 < 10000 ] <- NA
pct.change <- mhv.change / mhv.00
summary( pct.change )
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -0.96891  0.05918  0.25402  0.33167  0.49556 60.59261      220
# how many cases had increases above 500%
sum( pct.change > 5, na.rm=T )
## [1] 112
# preview tracts with large increases in home values 
# to see if increases make sense 

d %>% 
  filter( pct.change > 5 ) %>% 
  head()

Plot the percent change variable:

hg <-
hist( pct.change, breaks=2000, 
      xlim=c(-1,2), yaxt="n", xaxt="n",
      xlab="", cex.main=1.5,
      ylab="", main="Growth in Home Value by Census Tract 2000 to 2010",
      col="gray40", border="white" )

axis( side=1, at=seq( from=-1, to=2, by=0.5 ), 
      labels=paste0( seq( from=-100, to=200, by=50 ), "%" ) )

ymax <- max( hg$counts )
        
mean.x <- mean( pct.change, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=1, y=(0.5*ymax), 
      labels=paste0( "Mean = ", round(100*mean.x,0), "%"), 
      col="darkorange", cex=1.8, pos=4 )

median.x <- median( pct.change, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=1, y=(0.6*ymax), 
      labels=paste0( "Median = ", round(100*median.x,0), "%"), 
      col="dodgerblue", cex=1.8, pos=4 )

Group Growth Rates By Metro Area

We often want to disaggregate descriptives by some grouping in the data, such as metro areas.

dplyr makes this easy by grouping then summarizing the data.

d$mhv.change <- mhv.change 
d$pct.change <- pct.change
d$mhv.10 <- mhv.10
d$mhv.00 <- mhv.00

d %>%
  group_by( cbsaname ) %>%
  summarize( ave.change = median( mhv.change, na.rm=T ),
             ave.change.d = dollar( round(ave.change,0) ),
             growth = 100 * median( pct.change, na.rm=T ) ) %>%
  ungroup() %>%
  arrange( - growth ) %>%
  select( - ave.change ) %>% 
  head( 25 ) %>%
  pander()
cbsaname ave.change.d growth
Ocean City, NJ $154,667 93.07
Virginia Beach-Norfolk-Newport News, VA $97,955 71.81
Casper, WY $70,770 70.03
New York-Wayne-White Plains, NY-NJ $189,118 69.86
Kingston, NY $89,272 65.08
Barnstable Town, MA $146,088 65.07
Washington-Arlington-Alexandria DC-VA $139,136 64.82
Charlottesville, VA $104,487 62.74
Atlantic City, NJ $94,239 62.32
Baltimore-Towson, MD $102,918 61.9
Bethesda-Frederick-Gaithersburg, MD $146,639 61.46
San Luis Obispo-Paso Robles, CA $172,722 61.01
Midland, TX $54,576 60.3
Los Angeles-Long Beach-Santa Ana, CA $145,968 60.3
Redding, CA $87,677 59.12
Honolulu, HI $194,665 58.85
Chico, CA $81,746 57.83
Nassau-Suffolk, NY $149,785 57.62
Edison, NJ $125,056 57.19
Santa Ana-Anaheim-Irvine, CA $177,367 56.05
Poughkeepsie-Newburgh-Middletown, NY $101,010 55.18
Flagstaff, AZ $71,072 54.67
Salisbury, MD $60,400 54.06
Winchester, VA-WV $73,165 53.76
Odessa, TX $26,240 53.37

Measuring Gentrification

We saved the original merged dataset as d.full, so we don’t need to reload it:

Recall our data steps thus far:

# adjust 2000 home values for inflation 
mhv.00 <- d.full$mhmval00 * 1.28855  
mhv.10 <- d.full$mhmval12

mhv.change <- mhv.10 - mhv.00

# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?

mhv.00[ mhv.00 < 10000 ] <- NA
pct.change <- 100 * ( mhv.change / mhv.00 )
summary( pct.change )
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  -96.891    5.918   25.402   33.167   49.556 6059.261      220
d.full$mhv.00 <- mhv.00
d.full$mhv.10 <- mhv.10
d.full$mhv.change <- mhv.change
d.full$pct.change <- pct.change

Select Gentrification Variables

Select variables for operationalizing a definition of gentrification:

We need to add some variables from 2010:

Recall we created a 2000 to 2010 variable list for reference:

head( vars )
d3 <- select( d.full, 
             
             tractid, cbsa, cbsaname,            # ids / units of analysis
             
             mhv.00, mhv.10, mhv.change, pct.change,    # home value 
             
             hinc00, hu00, own00, rent00,        # ses
             hinc12, hu10, own10, rent10,
             
             empclf00, clf00, unemp00, prof00,   # employment 
             empclf12, clf12, unemp12, prof12,
             
             dpov00, npov00,                     # poverty
             dpov12, npov12,
             
             ag25up00, hs00, col00,              # education 
             ag25up12, hs12, col12,
             
             pop00.x, nhwht00, nhblk00, hisp00, asian00,   # race
             pop10, nhwht10, nhblk10, hisp10, asian10
             
          ) # end select


d3 <- 
  d3 %>%
  mutate( 
          # 2000 variables
          p.white.00 = 100 * nhwht00 / pop00.x,
          p.black.00 = 100 * nhblk00 / pop00.x,
          p.hisp.00 = 100 * hisp00 / pop00.x, 
          p.asian.00 = 100 * asian00 / pop00.x,
          p.hs.edu.00 = 100 * (hs00+col00) / ag25up00,
          p.col.edu.00 = 100 * col00 / ag25up00,
          p.prof.00 = 100 * prof00 / empclf00,
          p.unemp.00 = 100 * unemp00 / clf00,
          pov.rate.00 = 100 * npov00 / dpov00,
          
          # 2010 variables
          p.white.10 = 100 * nhwht10 / pop10,
          p.black.10 = 100 * nhblk10 / pop10,
          p.hisp.10 = 100 * hisp10 / pop10, 
          p.asian.10 = 100 * asian10 / pop10,
          p.hs.edu.10 = 100 * (hs12+col12) / ag25up12,
          p.col.edu.10 = 100 * col12 / ag25up12,
          p.prof.10 = 100 * prof12 / empclf12,
          p.unemp.10 = 100 * unemp12 / clf12,
          pov.rate.10 = 100 * npov12 / dpov12 )
d3 <-
  d3 %>%
  group_by( cbsaname ) %>%
  mutate( metro.mhv.pct.00 = ntile( mhv.00, 100 ),
          metro.mhv.pct.10 = ntile( mhv.10, 100 ),
          metro.median.pay.00 = median( hinc00, na.rm=T ),
          metro.median.pay.10 = median( hinc12, na.rm=T ),
          metro.race.rank.00 = ntile( (100-p.white.00), 100 ) ) %>%
  ungroup() %>%
  mutate( metro.mhv.pct.change = metro.mhv.pct.10 - metro.mhv.pct.00,
          pay.change = metro.median.pay.10 - metro.median.pay.00,
          race.change = p.white.10 - p.white.00,
          mhv.change = mhv.10 - mhv.00 )

Descriptive Statistics of Change Variables

d3 <-           
  d3 %>%
  select( c( "tractid", "cbsa", "cbsaname",
             "mhv.00", "mhv.10", "mhv.change","pct.change",
          "p.white.00", "p.black.00", "p.hisp.00", "p.asian.00", 
          "p.hs.edu.00", "p.col.edu.00", "p.prof.00",  "p.unemp.00", 
          "pov.rate.00", "p.white.10", "p.black.10", "p.hisp.10", 
          "p.asian.10", "p.hs.edu.10", "p.col.edu.10", "p.prof.10", 
          "p.unemp.10", "pov.rate.10", "metro.mhv.pct.00", 
          "metro.mhv.pct.10", "metro.median.pay.00", "metro.median.pay.10", 
          "metro.mhv.pct.change", "pay.change", "race.change",
          "metro.race.rank.00") ) 
  
# head( d3 ) %>% pander()
d3 <- data.frame(d3)
stargazer( d3, 
           type=s.type, 
           digits=0, 
           summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic Min Pctl(25) Median Mean Pctl(75) Max
mhv.00 11,167 105,661 154,903 187,129 224,337 1,288,551
mhv.10 9,999 123,200 193,200 246,570 312,000 1,000,001
mhv.change -1,228,651 7,101 35,981 59,372 94,140 983,765
pct.change -97 6 25 33 50 6,059
p.white.00 0 47 78 67 91 100
p.black.00 0 1 4 14 14 100
p.hisp.00 0 2 4 13 15 100
p.asian.00 0 1 2 5 5 95
p.hs.edu.00 0 67 72 72 77 100
p.col.edu.00 0 12 21 26 36 100
p.prof.00 0 23 31 34 43 100
p.unemp.00 0 3 5 6 8 100
pov.rate.00 0 4 9 12 17 100
p.white.10 0 37 70 60 87 100
p.black.10 0 2 5 15 17 100
p.hisp.10 0 3 7 17 21 100
p.asian.10 0 1 3 6 6 96
p.hs.edu.10 0 66 71 71 77 100
p.col.edu.10 0 15 25 30 41 100
p.prof.10 0 24 34 35 46 100
p.unemp.10 0 6 9 10 13 100
pov.rate.10 0 6 12 16 22 100
metro.mhv.pct.00 1 20 41 45 68 100
metro.mhv.pct.10 1 20 41 45 68 100
metro.median.pay.00 23,012 39,457 43,139 45,054 49,522 73,701
metro.median.pay.10 30,337 47,736 54,728 55,905 60,119 96,033
metro.mhv.pct.change -99 -6 -1 0 4 99
pay.change -3,207 7,495 9,764 10,851 13,809 27,310
race.change -100 -10 -5 -6 -1 90
metro.race.rank.00 1 20 41 45 68 100

Operationalizing Gentrification

Which definition did you select for gentrification, and how would you operationalize it?

# income
# percent white
# home values absolute
# home value relative to metro
# education stats ?
# employment stats ?
# income stats ?
# growth of pop per tract (density) ?


# home value below the metro median (50th percentile) in 2000
poor.2000 <- d3$metro.mhv.pct.00 < 50  

# above average diversity for metro
diverse.2000 <- d3$metro.race.rank.00 > 50 

# home values increased more than overall city gains 
# change in percentile rank within the metro
mhv.pct.increase <- d3$metro.mhv.pct.change > 0

# faster than average growth  
# 25% growth in value is median for the country
home.val.rise <- d3$pct.change > 25 

# proportion of whites increases by more than 3 percent 
# measured by an increase in the white population share
loss.diversity <- d3$race.change > 3 

g.flag <- poor.2000 & diverse.2000 & mhv.pct.increase & home.val.rise & loss.diversity

num.candidates <-  sum( poor.2000 & diverse.2000, na.rm=T )
num.gentrified <- sum( g.flag, na.rm=T )

num.gentrified 
## [1] 1137
num.candidates
## [1] 18219
num.gentrified / num.candidates
## [1] 0.06240738

By this definition, only about 6.2 percent of the candidate tracts (urban tracts that were relatively poor and diverse in 2000) experienced gentrification between 2000 and 2010.

Might the low initial home values still be skewing these numbers?

# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?

mhv.00[ mhv.00 < 1000 ] <- NA
pct.change <- 100 * ( mhv.change / mhv.00 )
summary( pct.change )
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  -96.891    5.918   25.402   33.167   49.556 6059.261      220

Discussion

What do you think? Is that the right way to operationalize it? Do you care about relative diversity (a more diverse neighborhood than the rest of the city) or absolute diversity (the percentage of non-whites regardless of overall city diversity)?

Do we care about the absolute increase in value, or the increase relative to the national average? The national average would include all of the gentrifying tracts, which could skew it upward and make it a poor benchmark. Maybe look at the average increase in value for non-gentrification candidates instead?
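
One quick way to construct that benchmark, sketched with the flags defined above:

# median growth among tracts that were NOT gentrification candidates
not.candidate <- !( poor.2000 & diverse.2000 )
median( d3$pct.change[ not.candidate ], na.rm=T )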

Spatial Visualization

Dust off your GIS skills from CPP 529 so that you can visualize some of your metrics.

You will need to pick one or more cities that you can use as examples in your report. It is strongly recommended that you create a Dorling cartogram for reporting, since Census tracts introduce visual bias by over-emphasizing low-density tracts and hiding much of the data in the dense tracts where the greatest number of people reside. Recall that Dorling cartograms correct for this by re-sizing administrative units in proportion to the size of the population they represent.

The lab from CPP 529 that covers creating Dorling cartograms is located here:

https://ds4ps.org/cpp-529-master/labs/lab-04-instructions.html
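
If you need a refresher on building one from scratch, the basic recipe looks roughly like the sketch below. The tract object tracts.sf and its population column pop are placeholders for your own data, and the k argument controls circle sizes, so tune it to taste.

library( sf )
library( cartogram )
library( tmap )

# project to a planar CRS before building the cartogram
tracts.sf <- st_transform( tracts.sf, crs=3395 )

# re-size tracts in proportion to population
tracts.dorling <- cartogram_dorling( tracts.sf, weight="pop", k=0.05 )

# map a tract-level variable (merge in your census variables first)
tm_shape( tracts.dorling ) +
  tm_polygons( col="pop", n=10, style="quantile", palette="Spectral" )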

# devtools::install_github( "sjewo/cartogram" )
# install.packages( "tmap" )

library( geojsonio )  # read geoJSON map files from GitHub
library( sp )         # spatial data class sp for shapefiles
library( cartogram )  # spatial maps w/ tract size bias reduction
library( tmap )       # thematic maps
library( maptools )   # spatial object manipulation 
library( sf )         # 'simple features' flavor of shapefiles


# we have phoenix already packaged and on GitHub for easy load: 

github.url <- "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/phx_dorling.geojson"
phx <- geojson_read( x=github.url,  what="sp" )
plot( phx )

# create small dataframe for the merge
df <- data.frame(  tractid=d$tractid, 
        mhv.00,  mhv.10,  mhv.change,  pct.change  )

# create GEOID that matches GIS format

# create a geoID for merging by tract 
df$GEOID <- substr( df$tractid, 6, 18 )  # extract codes
df$GEOID <- gsub( "-", "", df$GEOID )    # remove hyphens
class( df$GEOID )
## [1] "character"
head( df$GEOID )
## [1] "01001020100" "01001020200" "01001020300" "01001020400" "01001020500"
## [6] "01001020600"

Note the current class of the PHX shapefile IDs:

  • GEOID2 is numeric, has leading zero dropped
  • GEOID is a factor that retains leading zeros

You can either convert the data frame ID to numeric (which drops the leading zero) and merge on GEOID2, or keep it as a character vector and merge on GEOID.
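
If you prefer the first option, the merge would look roughly like this (left commented out here because the tutorial keeps the character IDs and merges on GEOID below):

# option 1 (sketch): convert to numeric so it matches GEOID2
# df$GEOID2 <- as.numeric( df$GEOID )
# phx <- merge( phx, df, by.x="GEOID2", by.y="GEOID2" )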

head( phx@data )  # sp class from GIS file, so the data frame is located in the @data slot
# merge census data with dorling map

nrow( phx ) # check dimensions
## [1] 913
phx <- merge( phx, df, by.x="GEOID", by.y="GEOID" )

# make sure they have not changed or 
# you are missing rows in your data frame
# or merging with the wrong ID
nrow( phx ) 
## [1] 913
phx <- spTransform( phx, CRS("+init=epsg:3395") )

bb <- st_bbox( c( xmin = -12519146, xmax = -12421368, 
                 ymax = 3965924, ymin = 3899074 ), 
               crs = st_crs("+init=epsg:3395")) 

tm_shape( phx, bbox=bb ) + 
  tm_polygons( col="mhv.00", n=10, style="quantile", palette="Spectral" ) +
  tm_layout( "Dorling Cartogram", title.position=c("right","top") )

tm_shape( phx, bbox=bb ) + 
  tm_polygons( col="mhv.change", n=10, style="quantile", palette="Spectral" ) +
  tm_layout( "Dorling Cartogram", title.position=c("right","top") )

tm_shape( phx, bbox=bb ) + 
  tm_polygons( col="pct.change", n=10, style="quantile", palette="Spectral" ) +
  tm_layout( "Dorling Cartogram", title.position=c("right","top") )