library( dplyr )
library( here )
library( knitr )
library( pander )
library( stargazer )
library( scales )
# set stargazer type to text for
# previewing in RMD docs but
# convert to type HTML when knitting
# (next code chunk)
s.type <- "text"
###################################
#
# STARGAZER SETTINGS
#
###################################
# DO NOT RUN CHUNK UNLESS KNITTING:
# changes table formats to html
# before rendering RMD docs
s.type <- "html"
For this part of the project you will create new variables and explore the merged census data.
This tutorial demonstrates some of the initial steps of variable creation and data exploration for the project.
See the data steps tutorial for the wrangling that occurs while creating our rodeo datasets.
All data files must be imported using relative paths built with the here::here() function. I cannot stress this enough: you cannot use a static file path (e.g. /Users/myname/some/file/path) because it breaks portability and therefore makes it impossible for anyone to reproduce your analysis. No one else can rerun your file because that static file path exists only on your machine. For a refresher on why you cannot use static file paths and need to use the here::here() function, please see Week 02's lecture videos.
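For example (the static path below is purely illustrative):
# BAD: a static path like this only works on the original author's machine
# d1 <- readRDS( "/Users/myname/project/data/rodeo/LTDB-2000.rds" )
# GOOD: here::here() builds the path relative to the project root,
# so the same code runs on any machine that clones the repository
d1 <- readRDS( here::here( "data/rodeo/LTDB-2000.rds" ) )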
The following chunks depend on you having clean data in your data/rodeo folder. These files are generated by following along with the Data Steps Tutorial.
To re-run the tutorial locally, follow these steps:
1. In RStudio, go to File –> New File –> Text File to create a blank file.
2. Copy the data steps .rmd code into the blank text file.
3. Save the file as labs/wk03/data_steps.rmd. You do not have to store it in the WK03 sub-directory, but you must save the file with the file extension .rmd.
4. Click Knit to knit the results to an HTML file and to produce all of the clean data files within data/rodeo.
# note: please do not use static file paths
# note: notice down below the use of here::here()
d1 <- readRDS( here::here( "data/rodeo/LTDB-2000.rds" ) )
d2 <- readRDS( here::here( "data/rodeo/LTDB-2010.rds" ) )
md <- readRDS( here::here( "data/rodeo/LTDB-META-DATA.rds" ) )
# check to make sure we are not losing
# or gaining observations in the merge
nrow( d1 )
## [1] 72693
d1 <- select( d1, - year )
d2 <- select( d2, - year )
d <- merge( d1, d2, by="tractid" )
d <- merge( d, md, by="tractid" )
nrow( d )
## [1] 72693
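The tabulation below comes from the urban/rural flag added during the metadata merge (a sketch; the column name urban is an assumption):
table( d$urban )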
##
## rural urban
## 12971 59722
We can create a function to compare variables from the 2000 and 2010 datasets:
# find variables that are in both files
compare_dfs <- function( df1, df2 )
{
# use regular expressions to remove numeric suffixes
var.names.1 <- names( df1 )
var.names.1 <- gsub( "[.][xy]$", "", var.names.1 )
var.names.1 <- gsub( "[0-9]{2}$", "", var.names.1 )
var.names.2 <- names( df2 )
var.names.2 <- gsub( "[.][xy]$", "", var.names.2 )
var.names.2 <- gsub( "[0-9]{2}$", "", var.names.2 )
shared <- intersect( var.names.1, var.names.2 ) %>% sort()
print( "SHARED VARIABLES:")
print( shared )
not.shared <- c( setdiff( var.names.1, var.names.2 ),
setdiff( var.names.2, var.names.1 ) ) %>% sort()
print( "NOT SHARED:" )
print( not.shared )
d.vars1 <- data.frame( type="shared", variables=shared, stringsAsFactors=F )
d.vars2 <- data.frame( type="not shared", variables=not.shared, stringsAsFactors=F )
dd <- rbind( d.vars1, d.vars2 )
return( dd )
}
vars <- compare_dfs( df1=d1, df2=d2 )
## [1] "SHARED VARIABLES:"
## [1] "a15asn" "a15blk" "a15hsp" "a15ntv" "a15wht" "a18und" "a60asn"
## [8] "a60blk" "a60hsp" "a60ntv" "a60up" "a60wht" "a75up" "ag15up"
## [15] "ag18cv" "ag25up" "ag5up" "ageasn" "ageblk" "agehsp" "agentv"
## [22] "agewht" "asian" "china" "clf" "col" "cuban" "dapov"
## [29] "dbpov" "dflabf" "dfmpov" "dhpov" "dmulti" "dnapov" "dpov"
## [36] "dwpov" "empclf" "family" "fb" "fhh" "filip" "flabf"
## [43] "geanc" "gefb" "h10yrs" "h30old" "haw" "hh" "hha"
## [50] "hhb" "hhh" "hhw" "hinc" "hinca" "hincb" "hinch"
## [57] "hincw" "hisp" "hs" "hu" "incpc" "india" "iranc"
## [64] "irfb" "itanc" "itfb" "japan" "korea" "lep" "manuf"
## [71] "mar" "mex" "mhmval" "mrent" "multi" "n10imm" "n65pov"
## [78] "napov" "nat" "nbpov" "nfmpov" "nhblk" "nhpov" "nhwht"
## [85] "nnapov" "npov" "ntv" "nwpov" "ohu" "olang" "own"
## [92] "pop" "pr" "prof" "rent" "ruanc" "rufb" "scanc"
## [99] "scfb" "semp" "tractid" "unemp" "vac" "vet" "viet"
## [106] "wds"
## [1] "NOT SHARED:"
## [1] "a65asn" "a65blk" "a65hsp" "a65ntv" "a65wht" "cni16u" "dis"
## [8] "hu00sp" "ohu00sp" "p10imm" "p10yrs" "p15asn" "p15blk" "p15hsp"
## [15] "p15ntv" "p15wht" "p18und" "p30old" "p60up" "p65asn" "p65blk"
## [22] "p65hsp" "p65ntv" "p65pov" "p65wht" "p75up" "papov" "pasian"
## [29] "pbpov" "pchina" "pcol" "pcuban" "pfb" "pfhh" "pfilip"
## [36] "pflabf" "pfmpov" "pgeanc" "pgefb" "phaw" "phisp" "phpov"
## [43] "phs" "pindia" "piranc" "pirfb" "pitanc" "pitfb" "pjapan"
## [50] "pkorea" "plep" "pmanuf" "pmar" "pmex" "pmulti" "pnapov"
## [57] "pnat" "pnhblk" "pnhwht" "pntv" "polang" "pown" "ppov"
## [64] "ppr" "pprof" "pruanc" "prufb" "pscanc" "pscfb" "psemp"
## [71] "punemp" "pvac" "pvet" "pviet" "pwds" "pwpov"
Create a subset of variables for the analysis:
d.full <- d  # keep a copy of the full merged dataset
d <- d.full  # work from the copy so you can reset d if needed
d <- select( d, tractid, mhmval00, mhmval12, hinc00,
hu00, own00, rent00,
empclf00, clf00, unemp00, prof00,
dpov00, npov00,
ag25up00, hs00, col00,
pop00.x, nhwht00, nhblk00, hisp00, asian00,
cbsa, cbsaname )
d <-
d %>%
mutate( p.white = 100 * nhwht00 / pop00.x,
p.black = 100 * nhblk00 / pop00.x,
p.hisp = 100 * hisp00 / pop00.x,
p.asian = 100 * asian00 / pop00.x,
p.hs = 100 * (hs00+col00) / ag25up00,
p.col = 100 * col00 / ag25up00,
p.prof = 100 * prof00 / empclf00,
p.unemp = 100 * unemp00 / clf00,
pov.rate = 100 * npov00 / dpov00 )
If you call the stargazer() function with a linear model object (a regression model) it will create a nicely-formatted regression table for you.
If you instead use a data frame object it will create a table of descriptive statistics.
You can set the statistics you would like reported in the table with the summary.stat= argument.
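The descriptive table below was generated from the subset created above with a call along these lines (a sketch; the exact arguments may differ):
# descriptive statistics for the 2000 subset and derived percentages
stargazer( d,
           type=s.type,
           digits=0,
           summary.stat = c("min", "p25","median","mean","p75","max") )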
Statistic | Min | Pctl(25) | Median | Mean | Pctl(75) | Max |
---|---|---|---|---|---|---|
mhmval00 | 0 | 81,600 | 119,900 | 144,738 | 173,894 | 1,000,001 |
mhmval12 | 9,999 | 123,200 | 193,200 | 246,570 | 312,000 | 1,000,001 |
hinc00 | 2,499 | 33,000 | 43,825 | 47,657 | 58,036 | 200,001 |
hu00 | 0 | 1,102 | 1,519 | 1,570 | 1,999 | 11,522 |
own00 | 0 | 542 | 902 | 939 | 1,289 | 4,911 |
rent00 | 0 | 195 | 398 | 516 | 712 | 8,544 |
empclf00 | 0 | 1,205 | 1,756 | 1,820 | 2,373 | 10,334 |
clf00 | 0 | 1,302 | 1,865 | 1,930 | 2,502 | 11,251 |
unemp00 | 0 | 51 | 87 | 110 | 140 | 6,405 |
prof00 | 0 | 299 | 539 | 637 | 873 | 6,610 |
dpov00 | 0 | 2,671 | 3,718 | 3,804 | 4,871 | 23,892 |
npov00 | 0 | 149 | 304 | 452 | 601 | 5,515 |
ag25up00 | 0 | 1,763 | 2,451 | 2,520 | 3,224 | 17,974 |
hs00 | 0 | 665 | 1,071 | 1,155 | 1,552 | 8,909 |
col00 | 0 | 243 | 492 | 665 | 923 | 9,313 |
pop00.x | 0 | 2,751 | 3,802 | 3,901 | 4,976 | 36,206 |
nhwht00 | 0 | 1,308 | 2,514 | 2,591 | 3,713 | 20,619 |
nhblk00 | 0 | 41 | 141 | 522 | 527 | 14,039 |
hisp00 | 0 | 55 | 153 | 547 | 533 | 13,391 |
asian00 | 0 | 22 | 65 | 189 | 183 | 9,491 |
p.white | 0 | 47 | 78 | 67 | 91 | 100 |
p.black | 0 | 1 | 4 | 14 | 14 | 100 |
p.hisp | 0 | 2 | 4 | 13 | 15 | 100 |
p.asian | 0 | 1 | 2 | 5 | 5 | 95 |
p.hs | 0 | 67 | 72 | 72 | 77 | 100 |
p.col | 0 | 12 | 21 | 26 | 36 | 100 |
p.prof | 0 | 23 | 31 | 34 | 43 | 100 |
p.unemp | 0 | 3 | 5 | 6 | 8 | 100 |
pov.rate | 0 | 4 | 9 | 12 | 17 | 100 |
Initial conditions in 2000:
# adjust 2000 home values for inflation
mhv.00 <- d$mhmval00 * 1.28855
mhv.10 <- d$mhmval12
mhv.change <- mhv.10 - mhv.00
df <- data.frame( MedianHomeValue2000=mhv.00,
MedianHomeValue2010=mhv.10,
Change.00.to.10=mhv.change )
stargazer( df,
type=s.type,
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic | Min | Pctl(25) | Median | Mean | Pctl(75) | Max |
---|---|---|---|---|---|---|
MedianHomeValue2000 | 0 | 105,146 | 154,497 | 186,502 | 224,071 | 1,288,551 |
MedianHomeValue2010 | 9,999 | 123,200 | 193,200 | 246,570 | 312,000 | 1,000,001 |
Change.00.to.10 | -1,228,651 | 7,187 | 36,268 | 60,047 | 94,881 | 1,000,001 |
A note on inflation calculations. You can google “inflation calculator” and you will find a bunch of sites that give you the conversion. Just select 2010 as the final year, your input year, and use $1 as the starting value. For example:
https://westegg.com/inflation/
You will get slightly different results from site to site.
Otherwise, you can always estimate it if you know the average long-term inflation rate. Since 2000 the rate has averaged about 2.5% in the US, so you can use a factor of (1.025)^10 ≈ 1.28 for the 2000-to-2010 period.
Prior to 2000 the rates were higher and fluctuated quite a bit through the 70s and 80s, so if you are going back further in time it is better to use a calculator; the calculators use the actual rates by year.
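You can check the approximation directly in R; the 1.28855 factor used above comes from an inflation calculator, while the compound-rate estimate is:
# compound a 2.5% average annual rate over the ten years from 2000 to 2010
(1 + 0.025)^10   # roughly 1.28, close to the 1.28855 calculator value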
hist( mhv.change/1000, breaks=500,
xlim=c(-100,500), yaxt="n", xaxt="n",
xlab="Thousand of US Dollars (adjusted to 2010)", cex.lab=1.5,
ylab="", main="Change in Median Home Value 2000 to 2010",
col="gray20", border="white" )
axis( side=1, at=seq( from=-100, to=500, by=100 ),
labels=paste0( "$", seq( from=-100, to=500, by=100 ), "k" ) )
mean.x <- mean( mhv.change/1000, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=200, y=1500,
labels=paste0( "Mean = ", dollar( round(1000*mean.x,0)) ),
col="darkorange", cex=1.8, pos=3 )
median.x <- median( mhv.change/1000, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=200, y=2000,
labels=paste0( "Median = ", dollar( round(1000*median.x,0)) ),
col="dodgerblue", cex=1.8, pos=3 )
# function to control plot() formatting
jplot <- function( x1, x2, lab1="", lab2="", draw.line=T, ... )
{
plot( x1, x2,
pch=19,
col=gray(0.6, alpha = 0.2),
cex=2.5,
bty = "n",
xlab=lab1,
ylab=lab2, cex.lab=1.5,
... )
if( draw.line==T ){
ok <- is.finite(x1) & is.finite(x2)
lines( lowess(x2[ok]~x1[ok]), col="red", lwd=3 ) }
}
Compare 2000 to 2010 distributions.
layout.matrix <- matrix( c( 1,3,
2,3 ),
nrow=2, ncol=2, byrow=T )
layout( mat = layout.matrix,
heights = c(2,2), # Heights of the two rows
widths = c(3,4)) # Widths of the two columns
# layout.show(3)
par( mar=c(4,0,0,2) )
hist( mhv.00/1000, breaks=50,
xlim=c(-200,800), yaxt="n", xaxt="n",
xlab="", cex.lab=1,
ylab="", main="",
col="darkslateblue", border="white" )
axis( side=1, at=seq( from=0, to=1000, by=100 ),
labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )
abline( v=seq(0,1000,100), lty=2, col="gray80" )
text( 550, 4000, labels="Median Home \nValue in 2000",
col="darkslateblue", cex=1.8 )
hist( mhv.10/1000, breaks=50,
xlim=c(-200,800), yaxt="n", xaxt="n",
xlab="", cex.lab=1,
ylab="", main="",
col="darkslateblue", border="white" )
abline( v=seq(0,1000, 100 ), lty=2, col="gray80" )
text( 550, 3500, labels="Median Home \nValue in 2010",
col="darkslateblue", cex=1.8 )
axis( side=1, at=seq( from=0, to=1000, by=100 ),
labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )
# data reduction - randomly sample 1,000 observations for plotting
df <- data.frame( v00=mhv.00/1000, v10=mhv.10/1000 )
df <- sample_n( df, 1000 )
par( mar=c(4,5,3,2) )
jplot( df$v00, df$v10,
lab1="MHV in 2000", lab2="MHV in 2010",
xlim=c(0,1000), ylim=c(0,1000),
axes=F )
abline( a=0, b=1, lty=2, col="gray" )
axis( side=1, at=seq( from=0, to=1000, by=200 ),
labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )
axis( side=2, at=seq( from=0, to=1000, by=200 ),
labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )
If a home worth $10 million increased in value by $100k over ten years it would not be that surprising. If a home worth $50k increased by $100k over the same period that is a growth of 200% and is notable.
The change in value variable only reports absolute change, but does not provide a sense of whether that is a big or small amount for the census tract.
The percent change variable provides some context for growth rates of value in census tracts.
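A quick check of the arithmetic in the example above:
# a $100k gain on a $50k home is a 200% increase
100000 / 50000       # 2.00, i.e. 200%
# the same $100k gain on a $10 million home is only 1%
100000 / 10000000    # 0.01, i.e. 1%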
# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?
# interpretation is hard if there were no homes in 2000
# and thus an artificially low MHV. i don't trust cases
# that go from homes worth $10k to regular value
# because it is more likely errors in data or noise
# than meaningful variance
#
# quick filter to remove all of the problematic obs
# but need to go back and see which cases are problematic
mhv.00[ mhv.00 < 10000 ] <- NA
pct.change <- mhv.change / mhv.00
summary( pct.change )
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -0.96891 0.05918 0.25402 0.33167 0.49556 60.59261 220
# how many tracts grew by more than 500%?
# (the call below is a reconstruction of the step that produced this output)
sum( pct.change > 5, na.rm=T )
## [1] 112
# preview tracts with large increases in home values
# to see if increases make sense
d %>%
filter( pct.change > 5 ) %>%
head()
Plot the percent change variable:
hg <-
hist( pct.change, breaks=2000,
xlim=c(-1,2), yaxt="n", xaxt="n",
xlab="", cex.main=1.5,
ylab="", main="Growth in Home Value by Census Tract 2000 to 2010",
col="gray40", border="white" )
axis( side=1, at=seq( from=-1, to=2, by=0.5 ),
labels=paste0( seq( from=-100, to=200, by=50 ), "%" ) )
ymax <- max( hg$count )
mean.x <- mean( pct.change, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=1, y=(0.5*ymax),
labels=paste0( "Mean = ", round(100*mean.x,0), "%"),
col="darkorange", cex=1.8, pos=4 )
median.x <- median( pct.change, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=1, y=(0.6*ymax),
labels=paste0( "Median = ", round(100*median.x,0), "%"),
col="dodgerblue", cex=1.8, pos=4 )
We often want to disaggregate descriptives by some grouping in the data, such as metro areas.
dplyr makes this easy by grouping then summarizing the data.
d$mhv.change <- mhv.change
d$pct.change <- pct.change
d$mhv.10 <- mhv.10
d$mhv.00 <- mhv.00
d %>%
group_by( cbsaname ) %>%
summarize( ave.change = median( mhv.change, na.rm=T ),
ave.change.d = dollar( round(ave.change,0) ),
growth = 100 * median( pct.change, na.rm=T ) ) %>%
ungroup() %>%
arrange( - growth ) %>%
select( - ave.change ) %>%
head( 25 ) %>%
pander()
cbsaname | ave.change.d | growth |
---|---|---|
Ocean City, NJ | $154,667 | 93.07 |
Virginia Beach-Norfolk-Newport News, VA | $97,955 | 71.81 |
Casper, WY | $70,770 | 70.03 |
New York-Wayne-White Plains, NY-NJ | $189,118 | 69.86 |
Kingston, NY | $89,272 | 65.08 |
Barnstable Town, MA | $146,088 | 65.07 |
Washington-Arlington-Alexandria DC-VA | $139,136 | 64.82 |
Charlottesville, VA | $104,487 | 62.74 |
Atlantic City, NJ | $94,239 | 62.32 |
Baltimore-Towson, MD | $102,918 | 61.9 |
Bethesda-Frederick-Gaithersburg, MD | $146,639 | 61.46 |
San Luis Obispo-Paso Robles, CA | $172,722 | 61.01 |
Midland, TX | $54,576 | 60.3 |
Los Angeles-Long Beach-Santa Ana, CA | $145,968 | 60.3 |
Redding, CA | $87,677 | 59.12 |
Honolulu, HI | $194,665 | 58.85 |
Chico, CA | $81,746 | 57.83 |
Nassau-Suffolk, NY | $149,785 | 57.62 |
Edison, NJ | $125,056 | 57.19 |
Santa Ana-Anaheim-Irvine, CA | $177,367 | 56.05 |
Poughkeepsie-Newburgh-Middletown, NY | $101,010 | 55.18 |
Flagstaff, AZ | $71,072 | 54.67 |
Salisbury, MD | $60,400 | 54.06 |
Winchester, VA-WV | $73,165 | 53.76 |
Odessa, TX | $26,240 | 53.37 |
We saved the original merged dataset as d.full, so we don't need to reload it:
d2 <- d.full  # work from a fresh copy of the full merged data
Recall our data steps thus far:
# adjust 2000 home values for inflation
mhv.00 <- d2$mhmval00 * 1.28855
mhv.10 <- d2$mhmval12
mhv.change <- mhv.10 - mhv.00
# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?
mhv.00[ mhv.00 < 10000 ] <- NA
pct.change <- 100 * ( mhv.change / mhv.00 )
summary( pct.change )
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -96.891 5.918 25.402 33.167 49.556 6059.261 220
Select variables for operationalizing a definition of gentrification:
We need to add some variables from 2010. Recall we created a 2000 to 2010 variable list for reference. The home value vectors computed above are attached to the data frame first so they can be selected below:
d2$mhv.00 <- mhv.00
d2$mhv.10 <- mhv.10
d2$mhv.change <- mhv.change
d2$pct.change <- pct.change
d2 <- select( d2,
tractid, cbsa, cbsaname, # ids / units of analysis
mhv.00, mhv.10, mhv.change, pct.change, # home value
hinc00, hu00, own00, rent00, # ses
hinc12, hu10, own10, rent10,
empclf00, clf00, unemp00, prof00, # employment
empclf12, clf12, unemp12, prof12,
dpov00, npov00, # poverty
dpov12, npov12,
ag25up00, hs00, col00, # education
ag25up12, hs12, col12,
pop00.x, nhwht00, nhblk00, hisp00, asian00, # race
pop10, nhwht10, nhblk10, hisp10, asian10
) # end select
d2 <-
d2 %>%
mutate(
# 2000 variables
p.white.00 = 100 * nhwht00 / pop00.x,
p.black.00 = 100 * nhblk00 / pop00.x,
p.hisp.00 = 100 * hisp00 / pop00.x,
p.asian.00 = 100 * asian00 / pop00.x,
p.hs.edu.00 = 100 * (hs00+col00) / ag25up00,
p.col.edu.00 = 100 * col00 / ag25up00,
p.prof.00 = 100 * prof00 / empclf00,
p.unemp.00 = 100 * unemp00 / clf00,
pov.rate.00 = 100 * npov00 / dpov00,
# 2010 variables
p.white.10 = 100 * nhwht10 / pop10,
p.black.10 = 100 * nhblk10 / pop10,
p.hisp.10 = 100 * hisp10 / pop10,
p.asian.10 = 100 * asian10 / pop10,
p.hs.edu.10 = 100 * (hs12+col12) / ag25up12,
p.col.edu.10 = 100 * col12 / ag25up12,
p.prof.10 = 100 * prof12 / empclf12,
p.unemp.10 = 100 * unemp12 / clf12,
pov.rate.10 = 100 * npov12 / dpov12 )
d2 <-
d2 %>%
group_by( cbsaname ) %>%
mutate( metro.mhv.pct.00 = ntile( mhv.00, 100 ),
metro.mhv.pct.10 = ntile( mhv.10, 100 ),
metro.median.pay.00 = median( hinc00, na.rm=T ),
metro.median.pay.10 = median( hinc12, na.rm=T ),
metro.race.rank.00 = ntile( (100-p.white.00), 100 ) ) %>%
ungroup() %>%
mutate( metro.mhv.pct.change = metro.mhv.pct.10 - metro.mhv.pct.00,
pay.change = metro.median.pay.10 - metro.median.pay.00,
race.change = p.white.10 - p.white.00,
mhv.change = mhv.10 - mhv.00 )
Descriptive Statistics of Change Variables
d3 <-
d2 %>%
select( c( "tractid", "cbsa", "cbsaname",
"mhv.00", "mhv.10", "mhv.change","pct.change",
"p.white.00", "p.black.00", "p.hisp.00", "p.asian.00",
"p.hs.edu.00", "p.col.edu.00", "p.prof.00", "p.unemp.00",
"pov.rate.00", "p.white.10", "p.black.10", "p.hisp.10",
"p.asian.10", "p.hs.edu.10", "p.col.edu.10", "p.prof.10",
"p.unemp.10", "pov.rate.10", "metro.mhv.pct.00",
"metro.mhv.pct.10", "metro.median.pay.00", "metro.median.pay.10",
"metro.mhv.pct.change", "pay.change", "race.change",
"metro.race.rank.00") )
# head( d3 ) %>% pander()
d3 <- data.frame(d3)
stargazer( d3,
type=s.type,
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )
Statistic | Min | Pctl(25) | Median | Mean | Pctl(75) | Max |
---|---|---|---|---|---|---|
mhv.00 | 11,167 | 105,661 | 154,903 | 187,129 | 224,337 | 1,288,551 |
mhv.10 | 9,999 | 123,200 | 193,200 | 246,570 | 312,000 | 1,000,001 |
mhv.change | -1,228,651 | 7,101 | 35,981 | 59,372 | 94,140 | 983,765 |
pct.change | -97 | 6 | 25 | 33 | 50 | 6,059 |
p.white.00 | 0 | 47 | 78 | 67 | 91 | 100 |
p.black.00 | 0 | 1 | 4 | 14 | 14 | 100 |
p.hisp.00 | 0 | 2 | 4 | 13 | 15 | 100 |
p.asian.00 | 0 | 1 | 2 | 5 | 5 | 95 |
p.hs.edu.00 | 0 | 67 | 72 | 72 | 77 | 100 |
p.col.edu.00 | 0 | 12 | 21 | 26 | 36 | 100 |
p.prof.00 | 0 | 23 | 31 | 34 | 43 | 100 |
p.unemp.00 | 0 | 3 | 5 | 6 | 8 | 100 |
pov.rate.00 | 0 | 4 | 9 | 12 | 17 | 100 |
p.white.10 | 0 | 37 | 70 | 60 | 87 | 100 |
p.black.10 | 0 | 2 | 5 | 15 | 17 | 100 |
p.hisp.10 | 0 | 3 | 7 | 17 | 21 | 100 |
p.asian.10 | 0 | 1 | 3 | 6 | 6 | 96 |
p.hs.edu.10 | 0 | 66 | 71 | 71 | 77 | 100 |
p.col.edu.10 | 0 | 15 | 25 | 30 | 41 | 100 |
p.prof.10 | 0 | 24 | 34 | 35 | 46 | 100 |
p.unemp.10 | 0 | 6 | 9 | 10 | 13 | 100 |
pov.rate.10 | 0 | 6 | 12 | 16 | 22 | 100 |
metro.mhv.pct.00 | 1 | 20 | 41 | 45 | 68 | 100 |
metro.mhv.pct.10 | 1 | 20 | 41 | 45 | 68 | 100 |
metro.median.pay.00 | 23,012 | 39,457 | 43,139 | 45,054 | 49,522 | 73,701 |
metro.median.pay.10 | 30,337 | 47,736 | 54,728 | 55,905 | 60,119 | 96,033 |
metro.mhv.pct.change | -99 | -6 | -1 | 0 | 4 | 99 |
pay.change | -3,207 | 7,495 | 9,764 | 10,851 | 13,809 | 27,310 |
race.change | -100 | -10 | -5 | -6 | -1 | 90 |
metro.race.rank.00 | 1 | 20 | 41 | 45 | 68 | 100 |
Which definition did you select for gentrification, and how would you operationalize it?
# income
# percent white
# home values absolute
# home value relative to metro
# education stats ?
# employment stats ?
# income stats ?
# growth of pop per tract (density) ?
# home value in lower than average home in a metro in 2000
poor.2000 <- d3$metro.mhv.pct.00 < 50
# above average diversity for metro
diverse.2000 <- d3$metro.race.rank.00 > 50
# home values increased more than overall city gains
# change in percentile rank within the metro
mhv.pct.increase <- d3$metro.mhv.pct.change > 0
# faster than average growth
# 25% growth in value is median for the country
home.val.rise <- d3$pct.change > 25
# proportion of whites increases by more than 3 percent
# measured by increase in white
loss.diversity <- d3$race.change > 3
g.flag <- poor.2000 & diverse.2000 & mhv.pct.increase & home.val.rise & loss.diversity
num.candidates <- sum( poor.2000 & diverse.2000, na.rm=T )
num.gentrified <- sum( g.flag, na.rm=T )
num.gentrified
## [1] 1137
num.candidates
## [1] 18219
num.gentrified / num.candidates
## [1] 0.06240738
By this definition, only about 6.2 percent of candidate tracts (those that were relatively poor and diverse in 2000) experienced gentrification between 2000 and 2010.
Might the small initial home values be skewing these numbers? Recall the filter we applied:
# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?
mhv.00[ mhv.00 < 10000 ] <- NA
pct.change <- 100 * ( mhv.change / mhv.00 )
summary( pct.change )
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -96.891 5.918 25.402 33.167 49.556 6059.261 220
What do you think? Is that the right way to operationalize it? Do you care about relative diversity (a neighborhood that is more diverse than the rest of its city) or absolute diversity (the percentage of non-white residents regardless of how diverse the city is)?
Do we care about the absolute increase in value, or the increase relative to the national average? The national average includes all of the gentrifying tracts, which could skew it upward and make it a poor benchmark. Maybe look at the average increase in value for non-gentrification candidates instead?
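One way to explore the benchmark question is to look at typical growth among tracts that were not gentrification candidates (a sketch that reuses the objects defined above):
# median home-value growth for tracts that were NOT candidates
# (i.e. not both relatively poor and diverse in 2000)
candidate <- poor.2000 & diverse.2000
median( d3$pct.change[ ! candidate ], na.rm=TRUE )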
Dust off your GIS skills from CPP 529 so that you can visualize some of your metrics.
You will need to pick one or more cities to use as examples in your report. It is strongly recommended that you create a Dorling cartogram for reporting, since census tracts introduce visual bias by over-emphasizing low-density tracts and hiding much of the data in the parts of the city where the greatest number of people reside. Recall that Dorling cartograms correct for this by re-sizing administrative units in proportion to the population they represent.
The lab from CPP 529 that covers creating Dorling cartograms is located here:
https://ds4ps.org/cpp-529-master/labs/lab-04-instructions.html
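If you need to build a Dorling cartogram yourself rather than use the pre-packaged Phoenix file below, the core step is cartogram::cartogram_dorling(). A minimal sketch, assuming a projected SpatialPolygonsDataFrame named msp with a population column named POP (both names are placeholders):
library( cartogram )
# resize tracts in proportion to population; k scales the circle sizes
msp.dorling <- cartogram_dorling( x=msp, weight="POP", k=0.05 )
plot( msp.dorling )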
# devtools::install_github( "sjewo/cartogram" )
# install.packages( "tmap" )
library( geojsonio ) # read geoJSON map files from GitHub
library( sp ) # spatial data class sp for shapefiles
library( cartogram ) # spatial maps w/ tract size bias reduction
library( tmap ) # thematic maps
library( maptools ) # spatial object manipulation
library( sf ) # 'simple features' flavor of shapefiles
# we have phoenix already packaged and on GitHub for easy load:
github.url <- "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/phx_dorling.geojson"
phx <- geojson_read( x=github.url, what="sp" )
plot( phx )
# create small dataframe for the merge
df <- data.frame( tractid=d$tractid,
mhv.00, mhv.10, mhv.change, pct.change )
# create GEOID that matches GIS format
# create a geoID for merging by tract
df$GEOID <- substr( df$tractid, 6, 18 ) # extract codes
df$GEOID <- gsub( "-", "", df$GEOID ) # remove hyphens
class( df$GEOID )
## [1] "character"
## [1] "01001020100" "01001020200" "01001020300" "01001020400" "01001020500"
## [6] "01001020600"
Note the current class of the PHX shapefile IDs:
You can either convert the data frame ID to numeric (which drops the leading zero) and merge on GEOID2, or keep it as a character vector and merge on GEOID.
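A sketch of the first option (GEOID2 is assumed to be the numeric ID column in the shapefile; the second option is what we use below):
# option 1 (not run): numeric IDs, which drop the leading zero
# df$GEOID2 <- as.numeric( df$GEOID )
# phx <- merge( phx, df, by.x="GEOID2", by.y="GEOID2" )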
# check the number of tracts before the merge
nrow( phx )
## [1] 913
phx <- merge( phx, df, by.x="GEOID", by.y="GEOID" )
# make sure they have not changed or
# you are missing rows in your data frame
# or merging with the wrong ID
nrow( phx )
## [1] 913
phx <- spTransform( phx, CRS("+init=epsg:3395") )
bb <- st_bbox( c( xmin = -12519146, xmax = -12421368,
ymax = 3965924, ymin = 3899074 ),
crs = st_crs("+init=epsg:3395"))
tm_shape( phx, bbox=bb ) +
tm_polygons( col="mhv.00", n=10, style="quantile", palette="Spectral" ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )
tm_shape( phx, bbox=bb ) +
tm_polygons( col="mhv.change", n=10, style="quantile", palette="Spectral" ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )
tm_shape( phx, bbox=bb ) +
tm_polygons( col="pct.change", n=10, style="quantile", palette="Spectral" ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )