#
# ECON 549: NAFTA's Impact on Manufacturing
# Author: Nathan Hattersley
# Email: nate.hattersley@gmail.com
#
# This code creates the datasets, linear models,
# and graphs used in my paper. Raw data is downloaded
# from the internet, and package dependencies are
# detected and installed automatically. It should
# "just run." Email me if it doesn't "just run" or 
# if you want to know more about what it's doing.




# download missing packages
# load all dependencies
list.of.packages <- c("ggplot2", "dplyr","magrittr","lmtest","sandwich","latex2exp","boot")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
invisible(lapply(list.of.packages, require, character.only = TRUE))
rm(list.of.packages, new.packages)


## load the two data sets from csv
usaTariffs <- read.csv(url("http://www.math.arizona.edu/~nathanhattersley/tariffdata.csv")) %>% filter(Reporter.Name == "United States")
usaManufacturing <- read.csv(url("http://www.nber.org/nberces/nberces5811/sic5811.csv"))


## perform a left join:
## retain every entry from tariffs,
## try to match with an entry from manufacturing
usa <- usaTariffs %>% left_join(usaManufacturing, by = c("Product"="sic","Tariff.Year"="year"))
usa <- usa[!is.na(usa$emp),]
## remove the big data sets from RAM
rm(usaTariffs, usaManufacturing)


## calculate logSkill and logCap
within(usa, {
  logSkill <- log(1-prode/emp)
  logCap   <- log(cap/emp)
}) -> usa



## Calculate employment differential:
## Put in a new data frame
## left join later
empdiffTable <- Reduce(
  x=levels(factor(usa$Product)),
  f=function(current,sic) {
    industry.data <- usa %>% filter(Product==sic) # get data for every year for this industry
    out <- data.frame(Product=character(0), Tariff.Year = character(0), empdiff = numeric(0))
    for(year in 1991:2000) {
      # for each year, try and find employment for year i and year i-1
      empi_0 <- industry.data %>% filter(Tariff.Year == (year-1)) %>% select(emp) %>%  `[`(1,)
      empi_1 <- industry.data %>% filter(Tariff.Year == year) %>% select(emp) %>%  `[`(1,)
      if(all(length(empi_0) == 1,length(empi_1) == 1, !is.na(empi_1), !is.na(empi_0))) {
        out %<>% rbind(
          data.frame(Product=as.character(sic), Tariff.Year=as.character(year), empDiff=(empi_1/empi_0 - 1))
        ) # successful for this year, add it to our data frame to return
      }
    }
    rbind(current,out)
  },
  init=data.frame(Product=character(0), Tariff.Year = character(0), empDiff = numeric(0))
)



## join empdiffTable to our main data frame
## change column types to make it work (still gives warnings...)
usa$Product %<>% as.character()
usa$Tariff.Year %<>% as.character()
empdiffTable$Product %<>% as.character()
usa %<>% left_join(empdiffTable, by=c("Product","Tariff.Year"))
usa$Product %<>% as.factor()
usa$Tariff.Year %<>% as.factor()



## Pull out years I'll use in the model
usaReg <- usa %>% filter(Tariff.Year %in% c(1993,1997))




## Prepare the data for modeling:
## extract industries with valid data for both years
## calculate tariff differential
## output new data.frame for modeling (usaReg.clean)
Reduce(
  x = levels(usaReg$Product),
  f = function(current, sic) {
    
    # pull out variables of interest for both years
    # 1997 has two different tariff types
    sic93 <- usaReg %>% 
      filter(Product == sic & Tariff.Year == 1993) %>% 
      select(Product,Product.Name,Tariff.Year,logSkill,logCap, Simple.Average, empDiff)
    
    ahs97 <- usaReg %>% 
      filter(Product == sic & Tariff.Year == 1997 & DutyType == "AHS") %>%
      select(Product,Product.Name,Tariff.Year,logSkill,logCap, Simple.Average, empDiff)
    
    prf97 <- usaReg %>% filter(Product == sic & Tariff.Year == 1997 & DutyType == "PRF") %>%
      select(Product,Product.Name,Tariff.Year,logSkill,logCap, Simple.Average)
    
    
    # if all the data is there:
    # calculate tariff differential
    # add one entry for 93 and one for 97
    if (all(nrow(sic93) >= 1,
            nrow(ahs97) == 1,
            nrow(prf97) == 1,
            !is.na(prf97),
            !is.na(ahs97),
            !is.na(sic93))) {
      tDiff <- (sic93$Simple.Average - prf97$Simple.Average) / 100
      sic93 <- sic93[1,]
      sic93$tarDiff <- tDiff
      sic93$dYear <- 0
      ahs97$tarDiff <- tDiff
      ahs97$dYear <- 1
      out <- rbind(sic93,ahs97) %>%
        select(Product,
               Product.Name,
               Tariff.Year,
               empDiff,
               logSkill,
               logCap,
               tarDiff,
               dYear)
      return(rbind(current, out))
    }
    else # if some data is missing, return nothing
      return(current)
  },
  init = data.frame(
    Product = character(0),
    Product.Name = character(0),
    Tariff.Year = character(0),
    empDiff = numeric(0),
    logSkill = numeric(0),
    logCap = numeric(0),
    tarDiff = numeric(0),
    dYear = integer(0)
  )
) -> usaReg.clean



## change column types again
usaReg.clean$Product %<>% as.factor()
usaReg.clean$Product.Name %<>% as.factor()
usaReg.clean$Tariff.Year %<>% as.factor()



## Classify each two-digit SIC industry:
## Create a data frame with
## two digit sic codes and
## corresponding industry name
"20 Food & Kindred Products
21 Tobacco Products
22 Textile Mill Products
23 Apparel & Other Textile Products
24 Lumber & Wood Products
25 Furniture & Fixtures
26 Paper & Allied Products
27 Printing & Publishing
28 Chemical & Allied Products
29 Petroleum & Coal Products
30 Rubber & Miscellaneous Plastics Products
31 Leather & Leather Products
32 Stone, Clay, & Glass Products
33 Primary Metal Industries
34 Fabricated Metal Products
35 Industrial Machinery & Equipment
36 Electronic & Other Electric Equipment
37 Transportation Equipment
38 Instruments & Related Products
39 Miscellaneous Manufacturing Industries" %>% strsplit("\n") %>% unlist %>% Reduce(f=function(out, s) {
  s %<>% strsplit(" ") %>% unlist
  code <- s[1]
  name <- paste(s[2:length(s)],collapse = " ")
  rbind(out, data.frame(industryCode = code, industry = name))
  }, init = data.frame(industryCode = character(0), industry = character(0))) -> twodigitTable




## coerce four-digit codes to two-digit using integer division
## join two-digit industry with our main data frame
usaReg.clean$industryCode <- usaReg.clean$Product %>% as.character %>% as.integer %>% `/`(100) %>% as.integer %>% as.character
usaReg.clean %<>% left_join(twodigitTable, c("industryCode"))
usaReg.clean$industry %<>% as.factor
usaReg.clean$industryCode %<>% as.factor

rm(empdiffTable, twodigitTable, usa, usaReg)
################# DATA PROCESSING DONE ##########################




## Create a linear model
## Include residuals and fitted values
usaReg.lm <- lm(empDiff ~ tarDiff:dYear + logSkill + logCap + industryCode + dYear, data=usaReg.clean)
usaReg.clean$resid <- resid(usaReg.lm)
usaReg.clean$fitted <- fitted(usaReg.lm)




## Create diagnostic plots
line.color <- "#6C10E8"
graphops <- theme_bw() + theme(plot.title = element_text(hjust=0.5))
graphbase <- ggplot(usaReg.clean)

fittedvr.plot <- graphbase + geom_point(aes(x=fitted, y=resid)) + labs(title="Fitted vs. Residual Plot", x="Fitted Values", y="Residuals") + graphops
industryvr.plot <- graphbase + geom_boxplot(aes(x=industryCode, y=resid)) + labs(title="Boxplot of Residuals by Major Industry",x="Two Digit SIC",y="Residuals") + graphops
skillvr.plot <- graphbase + geom_point(aes(x=logSkill,y=resid)) + labs(title="logSkill vs. Residual Plot", x="log Skill Intensity", y="Residuals") + graphops
capvr.plot <- graphbase + geom_point(aes(x=logCap,y=resid)) + labs(title="logCap vs. Residual Plot", x="log Capital Intensity", y="Residuals") + graphops
yearvr.plot <- graphbase + geom_boxplot(aes(x=Tariff.Year, y=resid)) + labs(title="Boxplot of Residuals by Year",x="Year",y="Residuals") + graphops

rm(graphbase, line.color)




set.seed(549)
## Residual Resampling:
## Take re-sampled residuals, make new "responses"
## Refit model to these Ystar
## see ?boot::boot for info on bootstrapping in R
b1.bootstrap <- boot(usaReg.clean[,"resid"], function(x,i){
  Ystar <- usaReg.clean[,"fitted"] + x[i]
  coef(lm(Ystar ~ tarDiff:dYear + logSkill + logCap + dYear + industryCode, data=usaReg.clean))["tarDiff:dYear"]
}, R = 1e4)




## Create a histogram of the bootstrap density
## packages sandwich and lmtest used to get
## White Standard Errors of coefficient estimate
b1.theoretical <- vcovHC(usaReg.lm, type="HC") %>% 
  coeftest(usaReg.lm, vcov. = .) %>% # this grabs the mean and std error of the robust estimate
  `[`("tarDiff:dYear",c("Estimate","Std. Error"))

xseq <- seq(-1, 1, .001) # xseq and dnormx used to draw theoretical density
dnormx <- function(x) dnorm(x, mean=b1.theoretical[1], sd=b1.theoretical[2])

b1.hist <- ggplot(data.frame(x=b1.bootstrap$t)) + 
  geom_histogram(aes(x=x, y=..density..), bins=30, fill="white", col="black") + 
  stat_function(data = data.frame(x=xseq), fun=dnormx, col="red") + 
  labs(title=TeX("Histogram of Bootstrapped $\\beta_1$ Samples"), x=TeX("$\\beta_1$")) +
  graphops

rm(xseq, dnormx)




## Create an MC sample of the disemployment effects
mc.employment <- sample(usaReg.clean$tarDiff, 1e6, replace = T) * sample(b1.bootstrap$t,1e6, replace = T)



## Use a histogram to describe the disemployment effects
## Exclude outliers by creating mc.employment.retain
`%between%` <- function(x, lim) x[which(x >= lim[1] & x <= lim[2])]
mc.employment.retain <- mc.employment %between% quantile(mc.employment, c(.005,.995))

mc.employment.hist <- ggplot(data.frame(x=mc.employment.retain)) +
  geom_histogram(aes(x=x, y=..density..), bins=30, fill="white", col="black") +
  labs(title = "Retained Monte Carlo Samples of Employment Difference",
       x=TeX("$\\Delta$Employment")) + graphops
rm(`%between%`,mc.employment.retain, graphops)