#Seth A. Berkowitz, MD MPH"
#Code adapted from Kara Rudolph https://rss.onlinelibrary.wiley.com/doi/full/10.1111/rssb.12213#
#Code cannot be used for commericial purposes and all use must attribute original authors#

## a variable needs to be named a and have values 0/1
## site variable needs to be named 'site' and needs to have value 0 for the site where the outcome data is not used and value 1 for the site where the outcome data is used
## z variable needs to be named z and have values 0/1
## y variable needs to be named y and have values 0/1
## w variables in a dataframe named w and with names w1:wx

ittatetmle<-function(a, z, y, site, w, aamodel, asitemodel, azmodel, aoutmodel, aq2model){
  datw<-w
  n.dat<-nrow(datw)
  
  #calculate components of clever covariate
  cpa <- predict(glm(formula=aamodel,  family="binomial", data=data.frame(cbind(datw, a=a))), newdata=datw, type="response")
  cps <- predict(glm(formula=asitemodel, data=data.frame(cbind(site=site, datw)), family="binomial"), type="response")
  
  zmodels0 <- glm(formula=azmodel, data=data.frame(cbind(a=a, z=z, site=site, datw)), subset=site==0, family="binomial")
  zmodels1 <- glm(formula=azmodel,  data=data.frame(cbind(a=a, z=z, site=site, datw)),subset=site==1, family="binomial")
  data_new0<-data_new1<-datw
  data_new0$a<-0
  data_new1$a<-1
  
  dga1s0<-dbinom(z, 1,prob=predict(zmodels0, newdata=data_new1, type="response"))
  dga1s1<-dbinom(z, 1,prob=predict(zmodels1, newdata=data_new1, type="response"))
  dga0s0<-dbinom(z, 1,prob=predict(zmodels0, newdata=data_new0, type="response"))
  dga0s1<-dbinom(z, 1,prob=predict(zmodels1, newdata=data_new0, type="response"))
  
  #calculate clever covariate
  g0w<-(1-cpa)*(dga0s1/dga0s0)*(cps/(1-cps))
  g1w<-cpa*(dga1s1/dga1s0)*(cps/(1-cps))
  h0w<-((1-a)*I(site==1))/g0w
  h1w<-(a*I(site==1))/g1w
  
  ymodel<-glm(formula=aoutmodel,family="binomial", data=data.frame(cbind(datw, a=a,z=z, site=site, y=y)), subset=site==1)
  
  #initial prediciton
  q<-cbind(predict(ymodel, type="link", newdata=data.frame(cbind(datw, a=a,z=z))), predict(ymodel, type="link", newdata=data.frame(cbind(datw, a=0,z=z))), predict(ymodel, type="link", newdata=data.frame(cbind(datw, a=1,z=z))))
  
  epsilon<-coef(glm(y ~ -1 + offset(q[,1]) + h0w + h1w , family="binomial", subset=site==1 ))
  
  #update initial prediction
  q1<- q + c((epsilon[1]*h0w + epsilon[2]*h1w), epsilon[1]/g0w, epsilon[2]/g1w)
  
  predmodela0<-suppressWarnings(glm(formula=paste("plogis(q1)", aq2model, sep="~"), data=data.frame(cbind(w, a=a,site=site, q1=q1[,2])), subset=site==0 & a==0 , family="binomial"))
  predmodela1<-suppressWarnings(glm(formula=paste("plogis(q1)", aq2model, sep="~"), data=data.frame(cbind(w,a=a, site=site, q1=q1[,3])), subset=site==0 & a==1 , family="binomial"))
  predmodelaa<-suppressWarnings(glm(formula=paste("plogis(q1) ~", aq2model, "+a", sep=""), data=data.frame(cbind(w, site=site, q1=q1[,1], a=a)), subset=site==0, family="binomial"))
  
  #get initial prediction for second regression model 
  q2pred<-cbind(predict(predmodelaa, type="link", newdata=data.frame(cbind(datw, a=a))), predict(predmodela0, type="link", newdata=datw), predict(predmodela1, type="link", newdata=datw))
  
  cz<-cbind(ifelse(a==0,I(site==0)/(1-cpa), I(site==0)/cpa), I(site==0)/(1-cpa),  I(site==0)/cpa)
  
  epsilon2<-suppressWarnings(coef(glm(plogis(q1[,1]) ~ -1 + offset(q2pred[,1]) + cz[,2] + cz[,3], family="binomial", subset= site==0)))
  for(k in 1:2){
    epsilon2[k]<-ifelse(is.na(epsilon2[k]), 0, epsilon2[k])
  }
  
  q2<- q2pred + c((epsilon2[1]*cz[,2] + epsilon2[2]*cz[,3]), epsilon2[1]/(1-cpa), epsilon2[2]/cpa)
  
  tmleest<-mean(plogis(q2[,3][site==0]))-mean(plogis(q2[,2][site==0]))
  
  
  
  ps0<-mean(I(site==0))
  
  eic<-(((h1w/ps0) - (h0w/ps0))*(y - plogis(q[,1]))) +  (((a*cz[,3]/ps0) - ((1-a)*cz[,2]/ps0))* (plogis(q[,1]) - plogis(q2pred[,1]))) + ((I(site==0)/ps0)*((plogis(q2pred[,3]) - plogis(q2pred[,2])) - tmleest))
  
  #seth additions
  tmle_treated <- mean(plogis(q2[,3] [site==0]))
  tmle_untreated <- mean(plogis(q2[,2] [site==0]))
  variance_rd <- var(eic)/n.dat
  rd_pvalue <- 2 * pnorm(-abs(tmleest/sqrt(variance_rd)))
  tmleest_95CI_rd <- c(tmleest - (1.96*(sqrt(variance_rd))),
                       tmleest + (1.96*(sqrt(variance_rd))))
  
  
  
  return(list("est"=tmleest, "var"=var(eic)/n.dat, "eic"=eic, 
              "untreated"=tmle_untreated, "treated"=tmle_treated, "h1w"=h1w, "h0w"=h0w, 
              "confint_rd"= tmleest_95CI_rd, "rd_pvalue"=rd_pvalue, "q2" = q2))
  
}










library(sas7bdat)


#set directory where your cleaned and formatted data is located#
setwd ("H:TOPCAT/data/created data")



#main model, PO

dat <- read.sas7bdat("transport_s1.sas7bdat")
dat1 <- dat[complete.cases(dat),]

wmat <-  dat1[, 9:22]
names(wmat)[1:14] <- sprintf("w%d", 1:14)


amodel<-"a ~ 1"
sitemodel<-"site ~ w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12 + w13 + w14 "
zmodel<-"z ~ 1"
outmodel<-"y ~ z + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12 + w13 + w14 "
outmodelnoz<-"y ~ a + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12 + w13 + w14 "
q2model<-"w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12 + w13 + w14 "


ittatetmletransport<-ittatetmle(a=dat1$treatment, z=dat1$instrument, y=dat1$po, site=dat1$site, w=wmat, aamodel=amodel, 
                                asitemodel=sitemodel, azmodel=zmodel, aoutmodel=outmodel, aq2model=q2model 
)

ittatetmletransport$est
ittatetmletransport$var
ittatetmletransport$treated
ittatetmletransport$untreated
ittatetmletransport$confint_rd
ittatetmletransport$rd_pvalue


#HL test
q2 <- as.data.frame(ittatetmletransport$q2)

dat1$expected <- plogis(q2$V1)
dat2 <- subset(dat1, site==0)
dat_treated <- subset(dat1, site==0 & treatment==1)
dat_untreated <- subset(dat1, site==0 & treatment==0)
dat_untransported <- subset(dat1, site==1)

library(ResourceSelection)
test <- hoslem.test(dat2$po, dat2$expected, g = 10)
test

#Plot
library(givitiR)
cb <- givitiCalibrationBelt(dat2$po, dat2$expected, "external")
plot(cb, main = "TMLE calibration--Primary Outcome",
     xlab = "TMLE predicted risk of primary outcome",
     ylab = "Observed risk of primary outcome")


#untreated

library(ResourceSelection)
test <- hoslem.test(dat_untreated$po, dat_untreated$expected, g = 10)
test

cb <- givitiCalibrationBelt(dat_untreated$po, dat_untreated$expected, "external")
plot(cb, main = "TMLE calibration--Primary Outcome in Untreated",
     xlab = "TMLE predicted probability of mortality",
     ylab = "Observed risk of mortality")


#treated

library(ResourceSelection)
test <- hoslem.test(dat_treated$po, dat_treated$expected, g = 10)
test

cb <- givitiCalibrationBelt(dat_treated$po, dat_treated$expected, "external")
plot(cb, main = "TMLE calibration--Primary Outcome in Treated",
     xlab = "TMLE predicted probability of mortality",
     ylab = "Observed risk of mortality")



#main model, Mortality

dat <- read.sas7bdat("transport_s1.sas7bdat")
dat1 <- dat[complete.cases(dat),]

wmat <-  dat1[, 9:22]
names(wmat)[1:14] <- sprintf("w%d", 1:14)


amodel<-"a ~ 1"
sitemodel<-"site ~ w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12 + w13 + w14 "
zmodel<-"z ~ 1"
outmodel<-"y ~ z + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12 + w13 + w14 "
outmodelnoz<-"y ~ a + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12 + w13 + w14 "
q2model<-"w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12 + w13 + w14 "


ittatetmletransport<-ittatetmle(a=dat1$treatment, z=dat1$instrument, y=dat1$died, site=dat1$site, w=wmat, aamodel=amodel, 
                                asitemodel=sitemodel, azmodel=zmodel, aoutmodel=outmodel, aq2model=q2model 
)

ittatetmletransport$est
ittatetmletransport$var
ittatetmletransport$treated
ittatetmletransport$untreated
ittatetmletransport$confint_rd
ittatetmletransport$rd_pvalue


#HL test
q2 <- as.data.frame(ittatetmletransport$q2)

dat1$expected <- plogis(q2$V1)
dat2 <- subset(dat1, site==0)
dat_treated <- subset(dat1, site==0 & treatment==1)
dat_untreated <- subset(dat1, site==0 & treatment==0)
dat_untransported <- subset(dat1, site==1)

library(ResourceSelection)
test <- hoslem.test(dat2$died, dat2$expected, g = 10)
test

#Plot
library(givitiR)
cb <- givitiCalibrationBelt(dat2$died, dat2$expected, "external")
plot(cb, main = "TMLE calibration--Total Mortality",
     xlab = "TMLE predicted probability of mortality",
     ylab = "Observed risk of mortality")

#untreated

library(ResourceSelection)
test <- hoslem.test(dat_untreated$died, dat_untreated$expected, g = 10)
test

cb <- givitiCalibrationBelt(dat_untreated$died, dat_untreated$expected, "external")
plot(cb, main = "TMLE calibration--Total Mortality in Untreated",
     xlab = "TMLE predicted probability of mortality",
     ylab = "Observed risk of mortality")


#treated

library(ResourceSelection)
test <- hoslem.test(dat_treated$died, dat_treated$expected, g = 10)
test

cb <- givitiCalibrationBelt(dat_treated$died, dat_treated$expected, "external")
plot(cb, main = "TMLE calibration--Total Mortality in Treated",
     xlab = "TMLE predicted probability of mortality",
     ylab = "Observed risk of mortality")