# load all data as character vecs
d.2010.samp <- read.csv( "data/raw/ltdb_std_2010_sample.csv", colClasses="character" )
str( d.2010.samp[1:10] )
## 'data.frame': 73056 obs. of 10 variables:
## $ tractid : chr "1001020100" "1001020200" "1001020300" "1001020400" ...
## $ statea : chr "01" "01" "01" "01" ...
## $ countya : chr "001" "001" "001" "001" ...
## $ tracta : chr "020100" "020200" "020300" "020400" ...
## $ pnhwht12: chr "85.31999969" "37.02000046" "79.77999878" "92.59999847" ...
## $ pnhblk12: chr "11.52999973" "56.27000046" "17.14999962" "1.450000048" ...
## $ phisp12 : chr "0" "2.519999981" "1.769999981" "2.630000114" ...
## $ pntv12 : chr "0.170000002" "0" "0" "0.829999983" ...
## $ pasian12: chr "0" "2.839999914" "1.080000043" "0" ...
## $ phaw12 : chr "0" "0" "0" "0" ...
Check summary stats:
## [1] "-999" "0" "0" "9.649999619" "0"
## [6] "6.440000057"
## [1] 3874
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -999.00 0.00 1.76 -46.81 7.35 100.00
We have problems with missing data coded as -999, which will cause issues with any analysis.
Remove missing value codes “-999” and replace with variable mean or NAs.
# convert variables to numeric
# and remove missing value placeholders;
# impute missing values with the variable mean
#
# x: character vector of numeric values, with -999 coding "missing"
# returns: numeric vector; every -999 is replaced by the mean of the
#          remaining values (NaN if the whole column is -999)
clean_x <- function( x )
{
x <- as.numeric( x )
x[ x == -999 ] <- NA              # -999 is the LTDB missing-value code
mean.x <- mean( x, na.rm=TRUE )   # TRUE, not T: T/F can be reassigned
x[ is.na(x) ] <- mean.x           # mean imputation
return(x)
}
# apply clean_x() to every variable column of a data frame
#
# d: data frame whose leading columns are character ID variables
# start.column: position of the first numeric variable
# returns: d with columns start.column:ncol(d) cleaned and numeric
clean_d <- function( d, start.column )
{
# d <- fix_names( d )
var.cols <- seq( from=start.column, to=ncol(d) )
for( j in var.cols )
{
d[[ j ]] <- clean_x( d[[ j ]] )
}
return( d )
}
Test the code:
# first four columns are unique IDs - leave them as character vectors
d.2010.samp <- clean_d( d.2010.samp, start.column=5 )
str( d.2010.samp[1:10] )
## 'data.frame': 73056 obs. of 10 variables:
## $ tractid : chr "1001020100" "1001020200" "1001020300" "1001020400" ...
## $ statea : chr "01" "01" "01" "01" ...
## $ countya : chr "001" "001" "001" "001" ...
## $ tracta : chr "020100" "020200" "020300" "020400" ...
## $ pnhwht12: num 85.3 37 79.8 92.6 75.3 ...
## $ pnhblk12: num 11.53 56.27 17.15 1.45 18.1 ...
## $ phisp12 : num 0 2.52 1.77 2.63 2.53 ...
## $ pntv12 : num 0.17 0 0 0.83 0.18 ...
## $ pasian12: num 0 2.84 1.08 0 2.41 ...
## $ phaw12 : num 0 0 0 0 0 0 0 0 0 0 ...
Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. |
---|---|---|---|---|---|
0 | 0 | 2.81 | 6.512 | 7.35 | 100 |
That works!
We want to standardize datasets across all of the years so that they are all clean, have the same structure, same variable name conventions, etc.
# FIX VARIABLE NAMES
#
# input: data frame with raw LTDB column names
# output: the same data frame with lower-case, standardized names
fix_names <- function( d )
{
nm <- tolower( names( d ) )

# one-off renames: geography IDs plus the marriage variables
recode <- c( "statea"="state", "countya"="county", "tracta"="tract",
"trtid10"="tractid",
"mar-70"="mar70", "mar-80"="mar80", "mar-90"="mar90",
"mar.00"="mar00", "x12.mar"="mar12" )
hit <- nm %in% names( recode )
nm[ hit ] <- unname( recode[ nm[ hit ] ] )

# strip census source-file suffixes, one at a time (same order as before)
for( suffix in c( "sp1$", "sp2$", "sf3$", "sf4$" ) )
{
nm <- gsub( suffix, "", nm )
}

# nm <- gsub( "[0-9]{2}$", "", nm )
names( d ) <- nm
return( d )
}
# FIX TRACT IDS
# put into format: fips-SS-CCC-TTTTTT
# (2-digit state, 3-digit county, 6-digit tract)
#
# x: vector of tract ids, possibly missing leading zeros
# returns: character vector of standardized ids
#
# rewritten in base R (strrep) so the function no longer needs stringr
fix_ids <- function( x )
{
x <- as.character( x )
# left-pad with zeros to 11 digits (restores zeros dropped by Excel/csv)
short <- !is.na( x ) & nchar( x ) < 11
x[ short ] <- paste0( strrep( "0", 11 - nchar( x[ short ] ) ), x[ short ] )
state <- substr( x, 1, 2 )
county <- substr( x, 3, 5 )
tract <- substr( x, 6, 11 )
x <- paste( "fips", state, county, tract, sep="-" )
return(x)
}
# TIDY UP ONE RAW LTDB FILE
#
# file.name: name of a csv inside data/raw/,
#            e.g. "ltdb_std_2010_sample.csv"
# returns: data frame with year and type columns added, standardized
#          variable names, fixed tract ids, meta-variables dropped,
#          and all variable columns converted to cleaned numerics
tidy_up_data <- function( file.name )
{
d <- read.csv( file.path( "data", "raw", file.name ),
colClasses="character" )

# sample vs fullcount file; scalar condition, so if/else (not ifelse)
type <- if( grepl( "sample", file.name ) ) "sample" else "full"

# file names follow "LTDB_Std_YYYY_...", so the year sits at chars 10-13
year <- substr( file.name, 10, 13 )

# fix names
d <- fix_names( d )

# fix leading zero problem in tract ids
d$tractid <- fix_ids( d$tractid )

# drop meta-vars
drop.these <- c("state", "county", "tract", "placefp10",
"cbsa10", "metdiv10", "ccflag10",
"globd10", "globg10","globd00", "globg00",
"globd90", "globg90","globd80", "globg80")
d <- d[ ! names(d) %in% drop.these ]

# column position where variables start after IDs
d <- clean_d( d, start.column=2 )

# add year and type (sample/full)
d <- data.frame( year, type, d, stringsAsFactors=FALSE )

return( d )
}
Test code:
# smoke-test tidy_up_data() on the 2010 sample and fullcount files
file.name <- "ltdb_std_2010_sample.csv"
d.2010.s <- tidy_up_data( file.name )
head( d.2010.s[1:20] ) %>% pander()
file.name <- "LTDB_Std_2010_fullcount.csv"
d.2010.f <- tidy_up_data( file.name )
head( d.2010.f[1:20] ) %>% pander()
# stack sample + fullcount rows for the same year
d2 <- bind_rows( d.2010.s, d.2010.f )
# NOTE(review): the objects below hold 2000 data but reuse the
# "d.2010" names — consider renaming to d.2000.s / d.2000.f
file.name <- "ltdb_std_2000_sample.csv"
d.2010.s <- tidy_up_data( file.name )
head( d.2010.s[1:20] ) %>% pander()
file.name <- "LTDB_Std_2000_fullcount.csv"
d.2010.f <- tidy_up_data( file.name )
head( d.2010.f[1:20] ) %>% pander()
d2 <- bind_rows( d.2010.s, d.2010.f )
Clean and tidy all data from the same year, then combine sample and full dataframes into a single table.
# Clean both files for one census year, join them on tract id,
# and save the combined table as an .rds file in data/rodeo/.
#
# fn1, fn2: fullcount and sample file names for the same year
# year: census year, used to build the output file name
build_year <- function( fn1, fn2, year )
{
d1 <- tidy_up_data( fn1 )
d2 <- tidy_up_data( fn2 )

# the sample/full flag is redundant once the two tables are merged
d1$type <- NULL
d2$type <- NULL

# full outer join: keep tracts present in either file
d3 <- merge( d1, d2, by=c("year","tractid"), all=TRUE )

out.file <- paste0( "data/rodeo/LTDB-", year, ".rds" )
saveRDS( d3, out.file )
}
# build the merged fullcount + sample panel for every census year;
# each call writes data/rodeo/LTDB-<year>.rds
year <- 1970
f1 <- "LTDB_Std_1970_fullcount.csv"
f2 <- "ltdb_std_1970_sample.csv"
build_year( fn1=f1, fn2=f2, year=year )
year <- 1980
f1 <- "LTDB_Std_1980_fullcount.csv"
f2 <- "ltdb_std_1980_sample.csv"
build_year( fn1=f1, fn2=f2, year=year )
year <- 1990
f1 <- "LTDB_Std_1990_fullcount.csv"
f2 <- "ltdb_std_1990_sample.csv"
build_year( fn1=f1, fn2=f2, year=year )
year <- 2000
f1 <- "LTDB_Std_2000_fullcount.csv"
f2 <- "ltdb_std_2000_sample.csv"
build_year( fn1=f1, fn2=f2, year=year )
year <- 2010
f1 <- "LTDB_Std_2010_fullcount.csv"
f2 <- "ltdb_std_2010_sample.csv"
build_year( fn1=f1, fn2=f2, year=year )
Check a file:
year | tractid | pop00.x | nhwht00 | nhblk00 | ntv00 | asian00 |
---|---|---|---|---|---|---|
2000 | fips-01-001-020100 | 1921 | 1723 | 145 | 29 | 8 |
2000 | fips-01-001-020200 | 1892 | 671 | 1177 | 12 | 12 |
2000 | fips-01-001-020300 | 3339 | 2738 | 498 | 16 | 27 |
2000 | fips-01-001-020400 | 4556 | 4273 | 118 | 23 | 40 |
2000 | fips-01-001-020500 | 6054 | 5427 | 367.5 | 36.1 | 113.1 |
2000 | fips-01-001-020600 | 3272 | 2615 | 553.1 | 25.18 | 10.65 |
hisp00 | haw00 | india00 | china00 | filip00 | japan00 | korea00 | viet00 |
---|---|---|---|---|---|---|---|
12 | 0 | 4 | 0 | 1 | 1 | 2 | 0 |
16 | 0 | 0 | 1 | 3 | 1 | 6 | 0 |
55 | 1 | 0 | 3 | 3 | 8 | 2 | 1 |
101 | 0 | 6 | 5 | 7 | 13 | 8 | 4 |
95.24 | 0 | 5 | 17.01 | 21 | 20 | 31.02 | 10.01 |
63.93 | 0.9686 | 0.9686 | 0 | 0.9686 | 1.937 | 1.937 | 0.9686 |
mex00 | pr00 | cuban00 | hu00 | vac00 | ohu00 | a18und00 | a60up00 | a75up00 |
---|---|---|---|---|---|---|---|---|
4 | 2 | 0 | 769 | 93 | 676 | 519 | 260 | 69 |
11 | 1 | 3 | 731 | 67 | 664 | 530 | 282 | 103 |
29 | 16 | 0 | 1263 | 61 | 1202 | 960 | 594 | 229 |
43 | 32 | 0 | 1871 | 111 | 1760 | 1123 | 1009 | 244 |
35.09 | 28.06 | 2.005 | 2282 | 80.39 | 2202 | 1871 | 653.6 | 156.4 |
21.31 | 8.718 | 0 | 1310 | 139.5 | 1170 | 992.8 | 430.1 | 116.2 |
agewht00 | a15wht00 | a60wht00 | ageblk00 | a15blk00 | a60blk00 | agehsp00 |
---|---|---|---|---|---|---|
1723 | 403 | 245 | 141 | 31 | 13 | 12 |
671 | 156 | 120 | 1163 | 302 | 158 | 16 |
2738 | 691 | 499 | 491 | 132 | 84 | 55 |
4273 | 911 | 982 | 117 | 39 | 6 | 101 |
5427 | 1466 | 630.2 | 358.5 | 121.5 | 8.318 | 95.24 |
2615 | 677.1 | 356.5 | 540.5 | 176.3 | 65.87 | 63.93 |
a15hsp00 | a60hsp00 | agentv00 | a15ntv00 | a60ntv00 | ageasn00 | a15asn00 |
---|---|---|---|---|---|---|
4 | 1 | 15 | 5 | 0 | 7 | 2 |
5 | 1 | 6 | 0 | 0 | 7 | 1 |
18 | 3 | 7 | 2 | 0 | 23 | 3 |
29 | 12 | 10 | 0 | 3 | 32 | 4 |
36.07 | 7.01 | 25.07 | 8.01 | 0.005055 | 72.05 | 13 |
14.53 | 4.843 | 11.62 | 1.937 | 0.9686 | 6.78 | 1.937 |
a60asn00 | family00 | fhh00 | own00 | rent00 | pop00.y | ruanc00 | itanc00 |
---|---|---|---|---|---|---|---|
1 | 532 | 59 | 518 | 158 | 1879 | 0 | 5 |
0 | 494 | 121 | 452 | 212 | 1934 | 0 | 39 |
6 | 920 | 118 | 869 | 333 | 3339 | 0 | 66 |
5 | 1376 | 102 | 1390 | 370 | 4556 | 12 | 59 |
8.01 | 1747 | 144.6 | 1671 | 531.3 | 6054 | 10 | 84.13 |
0.9686 | 904.7 | 123 | 960.9 | 209.2 | 3272 | 0 | 57.15 |
geanc00 | iranc00 | scanc00 | rufb00 | itfb00 | gefb00 | irfb00 | scfb00 |
---|---|---|---|---|---|---|---|
139 | 166 | 6 | 0 | 0 | 0 | 0 | 0 |
78 | 34 | 0 | 0 | 0 | 0 | 0 | 0 |
186 | 157 | 49 | 0 | 0 | 17 | 0 | 7 |
355 | 302 | 17 | 0 | 0 | 11 | 9 | 0 |
530.5 | 487.3 | 35.06 | 0 | 0 | 9.071 | 0 | 0 |
145.3 | 167.6 | 2.906 | 0 | 0 | 10.65 | 0 | 0 |
fb00 | nat00 | n10imm00 | ag5up00 | olang00 | lep00 | ag25up00 | hs00 | col00 |
---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1781 | 38 | 0 | 1227 | 635 | 192 |
0 | 0 | 0 | 1774 | 51 | 8 | 1157 | 740 | 170 |
68 | 45 | 13 | 3047 | 133 | 17 | 2130 | 990 | 478 |
66 | 46 | 13 | 4281 | 132 | 0 | 3072 | 1477 | 708 |
81.2 | 64.2 | 9 | 5601 | 126.4 | 33.03 | 3785 | 1257 | 1214 |
30.03 | 10.65 | 5.812 | 3032 | 142.4 | 56.18 | 1977 | 1179 | 317.7 |
ag15up00 | mar00 | wds00 | clf00 | unemp00 | dflabf00 | flabf00 | empclf00 |
---|---|---|---|---|---|---|---|
1469 | 961 | 237 | 872 | 46 | 752 | 384 | 826 |
1479 | 598 | 314 | 802 | 80 | 776 | 392 | 722 |
2503 | 1438 | 577 | 1456 | 42 | 1318 | 677 | 1414 |
3635 | 2402 | 542 | 2191 | 77 | 1839 | 1062 | 2114 |
4446 | 3155 | 608.7 | 2955 | 49.35 | 2272 | 1317 | 2906 |
2383 | 1337 | 553.1 | 1584 | 87.18 | 1247 | 747.8 | 1497 |
prof00 | manuf00 | semp00 | ag18cv00 | vet00 | cni16u00 | dis00 | dpov00 |
---|---|---|---|---|---|---|---|
221 | 74 | 68 | 1316 | 240 | 1196 | 276 | 1790 |
154 | 82 | 61 | 1398 | 219 | 1195 | 435 | 1907 |
438 | 144 | 50 | 2284 | 494 | 1907 | 340 | 3262 |
673 | 277 | 250 | 3354 | 730 | 2793 | 452 | 4551 |
1173 | 391.9 | 167.4 | 3997 | 683.5 | 3789 | 420.4 | 6048 |
364.2 | 234.4 | 139.5 | 2262 | 355.5 | 2004 | 425.2 | 3272 |
npov00 | n65pov00 | dfmpov00 | nfmpov00 | dwpov00 | nwpov00 | dbpov00 |
---|---|---|---|---|---|---|
227 | 11 | 551 | 47 | 1759 | 212 | 31 |
433 | 32 | 476 | 65 | 626 | 53 | 1249 |
250 | 45 | 937 | 36 | 2669 | 142 | 466 |
207 | 45 | 1385 | 25 | 4268 | 177 | 105 |
223.3 | 18.11 | 1747 | 45.24 | 5372 | 141.4 | 400.4 |
497.9 | 67.8 | 902.8 | 82.33 | 2592 | 253.8 | 535.6 |
nbpov00 | dnapov00 | nnapov00 | dhpov00 | nhpov00 | dapov00 | napov00 |
---|---|---|---|---|---|---|
15 | 0 | 0 | 0 | 0 | 0 | 0 |
367 | 0 | 0 | 0 | 0 | 0 | 0 |
102 | 14 | 0 | 93 | 6 | 16 | 0 |
0 | 6 | 0 | 63 | 30 | 65 | 0 |
34.88 | 31.12 | 0 | 49.3 | 23 | 88.09 | 24 |
214.1 | 0 | 0 | 30.03 | 0 | 8.718 | 0 |
incpc00 | hu00sp | h30old00 | ohu00sp | h10yrs00 | dmulti00 | multi00 | hinc00 |
---|---|---|---|---|---|---|---|
17771 | 742 | 225 | 660 | 444 | 742 | 19 | 36685 |
14217 | 758 | 329 | 680 | 311 | 758 | 36 | 30298 |
18346 | 1263 | 452 | 1202 | 897 | 1263 | 96 | 46731 |
19741 | 1871 | 979 | 1760 | 1037 | 1871 | 77 | 46142 |
24492 | 2282 | 152.5 | 2202 | 1784 | 2282 | 334.4 | 58886 |
16395 | 1310 | 450.4 | 1170 | 696.4 | 1310 | 34.87 | 33699 |
hincw00 | hincb00 | hinch00 | hinca00 | mhmval00 | mrent00 | hh00 | hhw00 |
---|---|---|---|---|---|---|---|
36957 | 23438 | 44200 | 59228 | 76600 | 339 | 717 | 704 |
40288 | 27938 | 44200 | 59228 | 72900 | 260 | 629 | 245 |
48977 | 30163 | 48611 | 87500 | 79900 | 449 | 1204 | 1003 |
46774 | 18611 | 80090 | 112500 | 89800 | 494 | 1750 | 1659 |
59322 | 45502 | 51289 | 5113 | 116594 | 558.8 | 2191 | 2037 |
37727 | 18819 | 44200 | 37500 | 70400 | 337 | 1161 | 920.2 |
hhb00 | hhh00 | hha00 |
---|---|---|
13 | 0 | 0 |
365 | 0 | 0 |
169 | 22 | 6 |
46 | 16 | 7 |
125.4 | 14.06 | 5.045 |
186 | 0 | 8.718 |
Metro areas are designated by the US Census as Core-Based Statistical Areas (CBSA).
“A core-based statistical area (CBSA) is a U.S. geographic area defined by the Office of Management and Budget (OMB) that consists of one or more counties (or equivalents) anchored by an urban center of at least 10,000 people plus adjacent counties that are socioeconomically tied to the urban center by commuting. Areas defined on the basis of these standards applied to Census 2000 data were announced by OMB in June 2003. These standards are used to replace the definitions of metropolitan areas that were defined in 1990. The OMB released new standards based on the 2010 Census.” cite
Note that these are defined as sets of counties, so the definition files are organized with one county per row, and attributes associated with the county.
Census data files do not always have info about metro areas. If we need this information for our analysis we can get a crosswalk file from the National Bureau of Economic Research:
https://data.nber.org/data/cbsa-msa-fips-ssa-county-crosswalk.html
# county-to-CBSA crosswalk from NBER; read everything as character
# to preserve leading zeros in FIPS codes
URL <- "https://data.nber.org/cbsa-msa-fips-ssa-county-crosswalk/cbsatocountycrosswalk.csv"
cw <- read.csv( URL, colClasses="character" )
# all metro areas in the country
sort( unique( cw$cbsaname ) ) %>% head() %>% pander()
__, Abilene, TX, Aguadilla-Isabela-San Sebastián, PR, Akron, OH, Albany-Schenectady-Troy, NY and Albany, GA
There are 3,292 counties in 2010. Of these, 35% are urban, 65% are rural.
# note in the data dictionary for CBSA Name (copied below): “blanks are rural”
# flag each county: blank CBSA name means rural, otherwise urban
cw$urban <- ifelse( cw$cbsaname == "", "rural", "urban" )
table( cw$urban ) %>% pander()
rural | urban |
---|---|
2130 | 1163 |
# keep only the id / metro-area columns we need from the crosswalk
keep.these <- c( "countyname","state","fipscounty",
"msa","msaname",
"cbsa","cbsaname",
"urban" )
# all_of() makes selection by an external character vector explicit;
# passing the bare vector to select() is deprecated in current dplyr
cw <- dplyr::select( cw, dplyr::all_of( keep.these ) )
head( cw ) %>% pander()
countyname | state | fipscounty | msa | msaname | cbsa |
---|---|---|---|---|---|
AUTAUGA | AL | 01001 | 5240 | MONTGOMERY, AL | 33860 |
BALDWIN | AL | 01003 | 5160 | MOBILE, AL | |
BARBOUR | AL | 01005 | 01 | ALABAMA | |
BIBB | AL | 01007 | 01 | ALABAMA | 13820 |
BLOUNT | AL | 01009 | 1000 | BIRMINGHAM, AL | 13820 |
BULLOCK | AL | 01011 | 01 | ALABAMA |
cbsaname | urban |
---|---|
Montgomery, AL | urban |
rural | |
rural | |
Birmingham-Hoover, AL | urban |
Birmingham-Hoover, AL | urban |
rural |
Save for easy load:
It’s technically not raw data — we created a new variable and dropped some columns — but it is input data we are grabbing from an external site as meta-data, and it will not be a final research dataset used for analysis, so we can put it in the raw folder.
# DATA DICTIONARY FOR CROSSWALK
1. cbsatocountycrosswalk2005 set up by Jean Roth , jroth@nber.org , 20 Dec 2016
2. Source: fr05_cbsa_msa_xwalk_pub.txt
3. NBER URL: http://www.nber.org/data/cbsa-msa-fips-ssa-county-crosswalk.html
4. Source Page: http://www.cms.gov/Medicare/Medicare-Fee-for-Service-Payment/AcuteInpatientPPS/Acute-Inpatient-Files-for-Download-Items/CMS022637.html
5. Source File URL: http://www.cms.gov/Medicare/Medicare-Fee-for-Service-Payment/AcuteInpatientPPS/Downloads/fr05_cbsa_msa_xwalk_pub.zip
6. by Jean Roth , jroth@nber.org , 28 Nov 2016
ssacounty:
1. Los Angeles FIPS 06037 can have two SSA county codes: 05210 and 05200
obs: 3,293
vars: 21 20 Dec 2016 11:41
size: 757,390 (_dta has notes)
-----------------------------------------------------------------------------------------------------------
storage display value
variable name type format label variable label
-----------------------------------------------------------------------------------------------------------
countyname str26 %26s County Name
state str2 %9s State
ssacounty str5 %9s * SSA County Code
fipscounty str5 %9s FIPS County Code
msa str6 %9s Old MSA
l str1 %9s Lugar
msaname str48 %48s Old MSA Name
cbsa str5 %9s CBSA - if blank then rural area (set equal to first 2 digits of ssa code)
cbsaname str50 %50s CBSA Name
cbsaold long %12.0g (Blanks are Rural)
cbsanameold str42 %42s (Blanks are Rural)
ssast str2 %9s SSA State code
fipst str2 %9s FIPS State code
y2005 float %9.0g Present in 2005 source file
y2011 float %9.0g Present in 2011 source file
y2012 float %9.0g Present in 2012 source file
y2013 float %9.0g Present in 2013 source file
y2014 float %9.0g Present in 2014 source file
y2015 float %9.0g Present in 2015 source file
y2016 float %9.0g Present in 2016 source file
y2017 float %9.0g Present in 2017 source file
* indicated variables have notes
------------------------------------------------------------------------------------------------------------
Sorted by: fipscounty ssacounty
Each of the files contains redundant meta-data. We can remove it to make merges easier, and consolidate all of the meta-data (attributes of counties and census tracts) into a single file for ease of use.
We need one file per year from 1980 to 2000 to grab all of the unique meta-data in the files.
# EXTRACT TRACT META-DATA
#
# file.name: name of a csv inside data/raw/
# returns: data frame with only the tract id plus the geographic and
#          global-neighborhood meta-data columns present in the file
#
# (removed dead locals `type` and `year`, which were computed but
#  never used; fixed the comment that wrongly said "drop meta-vars")
extract_metadata <- function( file.name )
{
path <- paste0( "data/raw/", file.name )
d <- read.csv( path, colClasses="character" )

# fix names
d <- fix_names( d )

# fix leading zero problem in tract ids
d$tractid <- fix_ids( d$tractid )

# keep ONLY the meta-vars (the complement of what tidy_up_data drops)
keep.these <- c("tractid","state", "county", "tract", "placefp10",
"cbsa10", "metdiv10", "ccflag10",
"globd10", "globg10","globd00", "globg00",
"globd90", "globg90","globd80", "globg80")
d <- d[ names(d) %in% keep.these ]
return( d )
}
f.1970 <- "LTDB_Std_1970_fullcount.csv"
f.1980 <- "LTDB_Std_1980_fullcount.csv"
f.1990 <- "LTDB_Std_1990_fullcount.csv"
f.2000 <- "LTDB_Std_2000_fullcount.csv"
# 2000 file carries the full set of geographic meta-data columns
meta.d.2000 <- extract_metadata( file.name=f.2000 )
# earlier years contribute only their global-neighborhood codes
meta.d.1990 <- extract_metadata( file.name=f.1990 )
meta.d.1990 <- select( meta.d.1990, tractid, globd90, globg90 )
meta.d.1980 <- extract_metadata( file.name=f.1980 )
meta.d.1980 <- select( meta.d.1980, tractid, globd80, globg80 )
# full outer joins on the shared tractid column
meta.d <- merge( meta.d.2000, meta.d.1990, all=T )
meta.d <- merge( meta.d, meta.d.1980, all=T )
# tractid format is "fips-SS-CCC-TTTTTT": chars 6-7 hold the state
# FIPS code, chars 9-11 the county FIPS code
meta.d$fipscounty <- paste0( substr( meta.d$tractid, 6, 7 ),
substr( meta.d$tractid, 9, 11 ) )
head( meta.d ) %>% pander()
head( meta.d ) %>% pander()
tractid | state | county | tract | placefp10 |
---|---|---|---|---|
fips-01-001-020100 | AL | Autauga County | Census Tract 201 | 62328 |
fips-01-001-020200 | AL | Autauga County | Census Tract 202 | 62328 |
fips-01-001-020300 | AL | Autauga County | Census Tract 203 | 62328 |
fips-01-001-020400 | AL | Autauga County | Census Tract 204 | 62328 |
fips-01-001-020500 | AL | Autauga County | Census Tract 205 | 62328 |
fips-01-001-020600 | AL | Autauga County | Census Tract 206 | 62328 |
cbsa10 | metdiv10 | ccflag10 | globd00 | globg00 | globd90 | globg90 |
---|---|---|---|---|---|---|
33860 | 99999 | 0 | bw | White Black | w | White |
33860 | 99999 | 0 | bw | White Black | bw | White Black |
33860 | 99999 | 0 | bw | White Black | bw | White Black |
33860 | 99999 | 0 | w | White | w | White |
33860 | 99999 | 0 | bw | White Black | w | White |
33860 | 99999 | 0 | bw | White Black | bw | White Black |
globd80 | globg80 | fipscounty |
---|---|---|
w | White | 01001 |
bw | White Black | 01001 |
bw | White Black | 01001 |
w | White | 01001 |
w | White | 01001 |
bw | White Black | 01001 |
Load the CBSA crosswalk:
countyname | state | fipscounty | msa | msaname | cbsa |
---|---|---|---|---|---|
AUTAUGA | AL | 01001 | 5240 | MONTGOMERY, AL | 33860 |
BALDWIN | AL | 01003 | 5160 | MOBILE, AL | |
BARBOUR | AL | 01005 | 01 | ALABAMA | |
BIBB | AL | 01007 | 01 | ALABAMA | 13820 |
BLOUNT | AL | 01009 | 1000 | BIRMINGHAM, AL | 13820 |
BULLOCK | AL | 01011 | 01 | ALABAMA |
cbsaname | urban |
---|---|
Montgomery, AL | urban |
rural | |
rural | |
Birmingham-Hoover, AL | urban |
Birmingham-Hoover, AL | urban |
rural |
# drop columns that duplicate information already in meta.d
cw <- select( cw, -countyname, -state )
# new counties since 2010 ?
setdiff( cw$fipscounty, meta.d$fipscounty )
## [1] "01990" "02031" "02040" "02080" "02120" "02140" "02160" "02190" "02200"
## [10] "02201" "02210" "02231" "02232" "02250" "02260" "02280" "02990" "04990"
## [19] "05990" "06990" "08990" "09990" "10990" "12990" "13990" "15990" "16990"
## [28] "17990" "18990" "19990" "20990" "21990" "22990" "23990" "24990" "25990"
## [37] "26990" "27990" "28990" "29990" "30113" "30990" "31990" "32990" "33990"
## [46] "34990" "35990" "36990" "37990" "38990" "39990" "40990" "41990" "42990"
## [55] "44999" "45990" "46131" "46990" "47990" "48990" "49990" "50990" "51560"
## [64] "51695" "51780" "51990" "53990" "54990" "55990" "56990" "72001" "72003"
## [73] "72005" "72007" "72009" "72011" "72013" "72015" "72017" "72019" "72021"
## [82] "72023" "72025" "72027" "72029" "72031" "72033" "72035" "72037" "72039"
## [91] "72041" "72043" "72045" "72047" "72049" "72051" "72053" "72054" "72055"
## [100] "72057" "72059" "72061" "72063" "72065" "72067" "72069" "72071" "72073"
## [109] "72075" "72077" "72079" "72081" "72083" "72085" "72087" "72089" "72091"
## [118] "72093" "72095" "72097" "72099" "72101" "72103" "72105" "72107" "72109"
## [127] "72111" "72113" "72115" "72117" "72119" "72121" "72123" "72125" "72127"
## [136] "72129" "72131" "72133" "72135" "72137" "72139" "72141" "72143" "72145"
## [145] "72147" "72149" "72151" "72153" "72990"
## [1] 3293
## [1] 3292
## [1] 72693
## [1] 72693
fipscounty | tractid | state | county | tract |
---|---|---|---|---|
01001 | fips-01-001-020100 | AL | Autauga County | Census Tract 201 |
01001 | fips-01-001-020200 | AL | Autauga County | Census Tract 202 |
01001 | fips-01-001-020300 | AL | Autauga County | Census Tract 203 |
01001 | fips-01-001-020400 | AL | Autauga County | Census Tract 204 |
01001 | fips-01-001-020500 | AL | Autauga County | Census Tract 205 |
01001 | fips-01-001-020600 | AL | Autauga County | Census Tract 206 |
placefp10 | cbsa10 | metdiv10 | ccflag10 | globd00 | globg00 | globd90 |
---|---|---|---|---|---|---|
62328 | 33860 | 99999 | 0 | bw | White Black | w |
62328 | 33860 | 99999 | 0 | bw | White Black | bw |
62328 | 33860 | 99999 | 0 | bw | White Black | bw |
62328 | 33860 | 99999 | 0 | w | White | w |
62328 | 33860 | 99999 | 0 | bw | White Black | w |
62328 | 33860 | 99999 | 0 | bw | White Black | bw |
globg90 | globd80 | globg80 | msa | msaname | cbsa |
---|---|---|---|---|---|
White | w | White | 5240 | MONTGOMERY, AL | 33860 |
White Black | bw | White Black | 5240 | MONTGOMERY, AL | 33860 |
White Black | bw | White Black | 5240 | MONTGOMERY, AL | 33860 |
White | w | White | 5240 | MONTGOMERY, AL | 33860 |
White | w | White | 5240 | MONTGOMERY, AL | 33860 |
White Black | bw | White Black | 5240 | MONTGOMERY, AL | 33860 |
cbsaname | urban |
---|---|
Montgomery, AL | urban |
Montgomery, AL | urban |
Montgomery, AL | urban |
Montgomery, AL | urban |
Montgomery, AL | urban |
Montgomery, AL | urban |
Save for easy load:
Build one large stacked dataset:
Hard to use because you don’t know which panel years exist for each variable.
# Build one tidy table per raw file, then stack them into one long
# panel. `these` is the vector of raw LTDB file names (defined
# earlier in the document). lapply() replaces the original manual
# loop counter and list-growing pattern.
d.list <- lapply( these, tidy_up_data )
d <- bind_rows( d.list )
Then you can reshape the dataset as needed:
# keep only the 2000 and 2010 panels
# NOTE(review): `dat` is not defined in this chunk — presumably the
# stacked panel from above; confirm before running
dat <- filter( dat, year %in% c(2000,2010) )
library(data.table) # CRAN version 1.10.4
setDT(world) # coerce to data.table
# illustrative long -> wide reshape with data.table::dcast()
# (`world` is an example dataset, not defined here)
data_wide <- dcast(world, Country ~ Year,
value.var = c("Growth", "Unemployment", "Population"))
# base-R equivalent using stats::reshape()
reshape(world, direction = "wide", timevar = "Year", idvar = "Country")
# wide version of the tract panel: one row per tract with
# year-suffixed columns (first 20 variables only)
d2 <- d[1:20]
reshape( d2, direction="wide", timevar="year", idvar="tractid" )
+---------+------+--------+--------------+------------+
| Country | Year | Growth | Unemployment | Population |
+---------+------+--------+--------------+------------+
| A | 2015 | 2 | 8.3 | 40 |
| B | 2015 | 3 | 9.2 | 32 |
| C | 2015 | 2.5 | 9.1 | 30 |
| D | 2015 | 1.5 | 6.1 | 27 |
| A | 2016 | 4 | 8.1 | 42 |
| B | 2016 | 3.5 | 9 | 32.5 |
| C | 2016 | 3.7 | 9 | 31 |
| D | 2016 | 3.1 | 5.3 | 29 |
| A | 2017 | 4.5 | 8.1 | 42.5 |
| B | 2017 | 4.4 | 8.4 | 33 |
| C | 2017 | 4.3 | 8.5 | 30 |
| D | 2017 | 4.2 | 5.2 | 30 |
+---------+------+--------+--------------+------------+
+---------+-------------+-------------------+-----------------+-------------+-------------------+-----------------+
| Country | Growth_2015 | Unemployment_2015 | Population_2015 | Growth_2016 | Unemployment_2016 | Population_2016 |
+---------+-------------+-------------------+-----------------+-------------+-------------------+-----------------+
| A | 2 | 8.3 | 40 | 4 | 8.1 | 42 |
| B | 3 | 9.2 | 32 | 3.5 | 9 | 32.5 |
| C | 2.5 | 9.1 | 30 | 3.7 | 9 | 31 |
| D | 1.5 | 6.1 | 27 | 3.1 | 5.3 | 29 |
+---------+-------------+-------------------+-----------------+-------------+-------------------+-----------------+