library(tidyverse) expit <- function(x) { exp(x)/(1+exp(x)) } ########################################################################## # When there is no association between X and Y there can be an apparent # association between X and Ypred
 ########################################################################## set.seed(1) # Generate auxiliary data N<-10000 #Generate exposure Exposure<-runif(N,40,70) # Generate non-collider covariates that will predict Y Homeowner<-rbinom(N,1,.52) Population<-rnorm(N,3976,6765) NO2<-rnorm(N,10.1,4.5) PM25<-rnorm(N,9.1,1.9) # Generate collider covariates to go into Ypred model # Poverty is strong collider between Homeowner and Exposure PovertyPredictor<-(1-Homeowner)*Exposure+.1*(Homeowner)*Exposure Poverty<-rbinom(N,1,PovertyPredictor/max(PovertyPredictor)) Race<-rmultinom(N,1,prob=c(.2,.45,1-.65)) White<-Race[2,] Black<-Race[1,] Female<-rbinom(N,1,.45) Old<-rbinom(N,1,.05) #Generate Y based on Homeowner but not Poverty so that the path between Exposure and Y is collided Y<-rbinom(N,1,(1-Homeowner)*.7+Homeowner*.3) #Save data auxiliarydata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female, Old)) #Overwrite this to generate analysis data N<-1000 #Generate exposure Exposure<-runif(N) # Generate non-collider covariates that will predict Y Homeowner<-rbinom(N,1,.52) Population<-rnorm(N,3976,6765) NO2<-rnorm(N,10.1,4.5) PM25<-rnorm(N,9.1,1.9) # Generate collider covariates to go into Ypred model # Poverty is strong collider between Homeowner and Exposure PovertyPredictor<-(1-Homeowner)*Exposure+.1*(Homeowner)*Exposure Poverty<-rbinom(N,1,PovertyPredictor/max(PovertyPredictor)) Race<-rmultinom(N,1,prob=c(.2,.45,1-.65)) White<-Race[2,] Black<-Race[1,] Female<-rbinom(N,1,.45) Old<-rbinom(N,1,.05) #Generate Y based on Homeowner but not Poverty so that the path between Exposure and Y is collided Y<-rbinom(N,1,(1-Homeowner)*.7+Homeowner*.3) #Save data analysisdata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female, Old)) # Create analysis data outcome variable #Predict Y using auxiliary data Model1<-glm(Y~Poverty+Black+White+Female+Old, family=binomial, data=auxiliarydata) #Get fitted values for analysis data Ypred<-predict(Model1,newdata=analysisdata,type="response") #Analyze data # Unadjusted model shows strong association p1<-glm(Ypred~Exposure, data=analysisdata, family=binomial()) summary(p1) # Using Y instead of Ypred, the true association is null p1true<-glm(Y~Exposure, data=analysisdata, family=binomial()) summary(p1true) # Figure using Ypred hyp_newdata1b <- with(analysisdata, data.frame(Exposure)) hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1, newdata = hyp_newdata1b, type = "link", se = TRUE)) hyp_newdata2b <- within(hyp_newdata2b, { PredictedProb <- plogis(fit) LL <- plogis(fit - (1.96 * se.fit)) UL <- plogis(fit + (1.96 * se.fit)) }) ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + geom_line(aes(), size = 1) + scale_y_continuous("Predicted Proportion Hypertension", breaks = c(.3,.35, .4 ,.45, .5, .55,.6), limits=c(.3, .6)) + theme_minimal(base_size = 14) + scale_x_continuous("Average day-night noise level (dBA)",breaks=c(0,0.33,0.67,1),labels=c("40","50","60","70")) # Figure using the true Y hyp_newdata1b <- with(analysisdata, data.frame(Exposure)) hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1true, newdata = hyp_newdata1b, type = "link", se = TRUE)) hyp_newdata2b <- within(hyp_newdata2b, { PredictedProb <- plogis(fit) LL <- plogis(fit - (1.96 * se.fit)) UL <- plogis(fit + (1.96 * se.fit)) }) ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + geom_line(aes(), size = 1) + scale_y_continuous("Predicted Proportion Hypertension", breaks = c(.3,.35, .4 ,.45, .5, .55,.6), limits=c(.3, .6)) + theme_minimal(base_size = 14) + scale_x_continuous("Average day-night noise level (dBA)",breaks=c(0,0.33,0.67,1),labels=c("40","50","60","70")) dev.off() ########################################################################## # When there is an association between X and Y the association between # X and Ypred can go in the opposite direction or be null
 ########################################################################## set.seed(1) # Generate auxiliary data N<-10000 # Generate covariates # Poverty will be a confounder Race<-rmultinom(N,1,prob=c(.2,.45,1-.65)) White<-Race[2,] Black<-Race[1,] Poverty<-rbinom(N,1,.3*Black+.08*White) Homeowner<-rbinom(N,1,.1*Poverty*Black+.2*Poverty+.2) Population<-rnorm(N,3976,6765) NO2<-rnorm(N,10.1,4.5) PM25<-rnorm(N,9.1,1.9) Female<-rbinom(N,1,.45) Old<-rbinom(N,1,.05) #Generate exposure, positive relationship with Poverty exp_mean<-Poverty*60+(1-Poverty)*52 Exposure<-rnorm(N,exp_mean,4) #Generate Y, negatively associated with Poverty and positively associated with Exposure # Ypredictor<-Poverty*(Exposure-20)+(1-Poverty)*(Exposure+20) Ypredictor<-Poverty*(Exposure-15)+(1-Poverty)*(Exposure+15) Y<-rbinom(N,1,Ypredictor/max(Ypredictor)) #Save data auxiliarydata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female,Old)) #Overwrite this to generate analysis data N<-1000 # Generate covariates # Poverty will be a confounder Race<-rmultinom(N,1,prob=c(.2,.45,1-.65)) White<-Race[2,] Black<-Race[1,] Poverty<-rbinom(N,1,.3*Black+.08*White) Homeowner<-rbinom(N,1,.1*Poverty*Black+.2*Poverty+.2) Population<-rnorm(N,3976,6765) NO2<-rnorm(N,10.1,4.5) PM25<-rnorm(N,9.1,1.9) Female<-rbinom(N,1,.45) Old<-rbinom(N,1,.05) #Generate exposure, positive relationship with Poverty exp_mean<-Poverty*60+(1-Poverty)*50 Exposure<-rnorm(N,exp_mean,5) #Generate Y, negatively associated with Poverty and positively associated with Exposure # Ypredictor<-Poverty*(Exposure-20)+(1-Poverty)*(Exposure+20) Ypredictor<-Poverty*(Exposure-15)+(1-Poverty)*(Exposure+15) Y<-rbinom(N,1,Ypredictor/max(Ypredictor)) #Save data analysisdata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female, Old)) # Create analysis data outcome variable #Predict Y using auxiliary data Model1<-glm(Y~Poverty+Black+White+Female+Old, family=binomial, data=auxiliarydata) #Get fitted values for analysis data Ypred<-predict(Model1,newdata=analysisdata,type="response") #Analyze data # Unadjusted model shows strong association p1<-glm(Ypred~Exposure, data=analysisdata, family=binomial()) summary(p1) # Using Y instead of Ypred, the true association is null p1true<-glm(Y~Exposure, data=analysisdata, family=binomial()) summary(p1true) # plot for Ypred hyp_newdata1b <- with(analysisdata, data.frame(Exposure)) hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1, newdata = hyp_newdata1b, type = "link", se = TRUE)) hyp_newdata2b <- within(hyp_newdata2b, { PredictedProb <- plogis(fit) LL <- plogis(fit - (1.96 * se.fit)) UL <- plogis(fit + (1.96 * se.fit)) }) tiff(filename = "~/output.tiff", type = "cairo" , res=300, height = 4, width = 4, units = "in") ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + geom_line(aes(), size = 1) + scale_y_continuous("Predicted Proportion Hypertension", breaks = c(.4, .5 ,.6, .7, .8, .9), limits=c(0.4, 1)) + theme_minimal(base_size = 14) + scale_x_continuous("Average day-night noise level (dBA)",breaks=c(40, 50,60,70),labels=c("40","50","60","70")) dev.off() # plot for true Y hyp_newdata1b <- with(analysisdata, data.frame(Exposure)) hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1true, newdata = hyp_newdata1b, type = "link", se = TRUE)) hyp_newdata2b <- within(hyp_newdata2b, { PredictedProb <- plogis(fit) LL <- plogis(fit - (1.96 * se.fit)) UL <- plogis(fit + (1.96 * se.fit)) }) tiff(filename = "~/output.tiff", type = "cairo" , res=300, height = 4, width = 4, units = "in") ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + geom_line(aes(), size = 1) + scale_y_continuous("Predicted Proportion Hypertension", breaks = c(.4, .5 ,.6, .7, .8, .9), limits=c(0.4, 1)) + theme_minimal(base_size = 14) + scale_x_continuous("Average day-night noise level (dBA)",breaks=c(40, 50,60,70),labels=c("40","50","60","70")) dev.off() ########################################################################## # There is a strong association between Exposure and true Y, but the relationship # between Exposure and Ypred is null--even though there are no confounders
 ########################################################################## set.seed(1) # Generate auxiliary data N<-10000 # Generate covariates Race<-rmultinom(N,1,prob=c(.2,.45,1-.65)) White<-Race[2,] Black<-Race[1,] Poverty<-rbinom(N,1,.3*Black+.08*White) Homeowner<-rbinom(N,1,.1*Poverty*Black+.2*Poverty+.2) Population<-rnorm(N,3976,6765) NO2<-rnorm(N,10.1,4.5) PM25<-rnorm(N,9.1,1.9) Female<-rbinom(N,1,.45) Old<-rbinom(N,1,.05) #Generate exposure independent of covariates--so there is no confounding Exposure<-rnorm(N,55,7) #Generate Y based on covariates above, and exposure Error<-rnorm(N,2,1) Predictor<-White+Black+Poverty+Homeowner+NO2/max(NO2)+PM25/max(PM25)+Old+Female+ Population/max(Population)+ 10*Exposure/max(Exposure) Probability<-(abs(Predictor/max(Predictor))) Y<-rbinom(N,1,prob=Probability) #Save data auxiliarydata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female,Old)) #Overwrite this to generate analysis data N<-1000 # Generate covariates Race<-rmultinom(N,1,prob=c(.2,.45,1-.65)) White<-Race[2,] Black<-Race[1,] Poverty<-rbinom(N,1,.3*Black+.08*White) Homeowner<-rbinom(N,1,.1*Poverty*Black+.2*Poverty+.2) Population<-rnorm(N,3976,6765) NO2<-rnorm(N,10.1,4.5) PM25<-rnorm(N,9.1,1.9) Female<-rbinom(N,1,.45) Old<-rbinom(N,1,.05) #Generate exposure independent of covariates--so there is no confounding Exposure<-rnorm(N,55,5) #Generate Y based on covariates above, and exposure Error<-rnorm(N,2,1) Predictor<-White+Black+Poverty+Homeowner+NO2/max(NO2)+PM25/max(PM25)+Old+Female+ Population/max(Population) + 10*Exposure/max(Exposure) Probability<-(abs(Predictor/max(Predictor))) Y<-rbinom(N,1,prob=Probability) #Save data analysisdata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female, Old)) # Create analysis data outcome variable #Predict Y using auxiliary data Model1<-glm(Y~Poverty+Black+White+Female+Old, family=binomial, data=auxiliarydata) #Get fitted values for analysis data Ypred<-predict(Model1,newdata=analysisdata,type="response") #Analyze data # Unadjusted model shows strong association p1<-glm(Ypred~Exposure, data=analysisdata, family=binomial()) summary(p1) # Using Y instead of Ypred, the true association is null p1true<-glm(Y~Exposure, data=analysisdata, family=binomial()) summary(p1true) # plot for Ypred hyp_newdata1b <- with(analysisdata, data.frame(Exposure)) hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1, newdata = hyp_newdata1b, type = "link", se = TRUE)) hyp_newdata2b <- within(hyp_newdata2b, { PredictedProb <- plogis(fit) LL <- plogis(fit - (1.96 * se.fit)) UL <- plogis(fit + (1.96 * se.fit)) }) tiff(filename = "~/output.tiff", type = "cairo" , res=300, height = 4, width = 4, units = "in") ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + geom_line(aes(), size = 1) + scale_y_continuous("Predicted Proportion Hypertension", breaks = c(.4, .5 ,.6, .7, .8, .9), limits=c(0.4, 1)) + theme_minimal(base_size = 14) + scale_x_continuous("Average day-night noise level (dBA)",breaks=c(40, 50,60,70),labels=c("40","50","60","70")) dev.off() # plot for true Y hyp_newdata1b <- with(analysisdata, data.frame(Exposure)) hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1true, newdata = hyp_newdata1b, type = "link", se = TRUE)) hyp_newdata2b <- within(hyp_newdata2b, { PredictedProb <- plogis(fit) LL <- plogis(fit - (1.96 * se.fit)) UL <- plogis(fit + (1.96 * se.fit)) }) tiff(filename = "~/output.tiff", type = "cairo" , res=300, height = 4, width = 4, units = "in") ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + geom_line(aes(), size = 1) + scale_y_continuous("Predicted Proportion Hypertension", breaks = c(.4, .5 ,.6, .7, .8, .9), limits=c(0.4, 1)) + theme_minimal(base_size = 14) + scale_x_continuous("Average day-night noise level (dBA)",breaks=c(40, 50,60,70),labels=c("40","50","60","70")) dev.off() ########################################################################## # When there is an association between Ypred and Exposure, it becomes null # when controlling for variables in the Ypred model ########################################################################## set.seed(1) # Generate auxiliary data N<-10000 # Generate covariates Race<-rmultinom(N,1,prob=c(.2,.45,1-.65)) White<-Race[2,] Black<-Race[1,] Poverty<-rbinom(N,1,.3*Black+.08*White) Homeowner<-rbinom(N,1,.1*Poverty*Black+.2*Poverty+.2) Population<-rnorm(N,3976,6765) NO2<-rnorm(N,10.1,4.5) PM25<-rnorm(N,9.1,1.9) Female<-rbinom(N,1,.45) Old<-rbinom(N,1,.05) #Generate exposure based on Poverty exp_mean<-Poverty*60+(1-Poverty)*52 Exposure<-rnorm(N,exp_mean,4) #Generate Y based on Black, White and Exposure (no common causes with exposure so no confounding) Error<-rnorm(N,2,1) Predictor<-White+Black+Exposure/max(Exposure) Probability<-(abs(Predictor/max(Predictor))) Y<-rbinom(N,1,prob=Probability) #Save data auxiliarydata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female,Old)) #Overwrite this to generate analysis data N<-1000 # Generate covariates Race<-rmultinom(N,1,prob=c(.2,.45,1-.65)) White<-Race[2,] Black<-Race[1,] Poverty<-rbinom(N,1,.3*Black+.08*White) Homeowner<-rbinom(N,1,.1*Poverty*Black+.2*Poverty+.2) Population<-rnorm(N,3976,6765) NO2<-rnorm(N,10.1,4.5) PM25<-rnorm(N,9.1,1.9) Female<-rbinom(N,1,.45) Old<-rbinom(N,1,.05) #Generate exposure based on Poverty exp_mean<-Poverty*60+(1-Poverty)*52 Exposure<-rnorm(N,exp_mean,4) #Generate Y based on Black, White and Exposure (no common causes with exposure so no confounding) Error<-rnorm(N,2,1) Predictor<-White+Black+Exposure/max(Exposure) Probability<-(abs(Predictor/max(Predictor))) Y<-rbinom(N,1,prob=Probability) #Save data analysisdata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female, Old)) # Create analysis data outcome variable #Predict Y using auxiliary data Model1<-glm(Y~Poverty+Black+White+Female+Old, family=binomial, data=auxiliarydata) #Get fitted values for analysis data Ypred<-predict(Model1,newdata=analysisdata,type="response") #Analyze data # Unadjusted model shows strong association p1<-glm(Ypred~Exposure, data=analysisdata, family=binomial()) summary(p1) p2<-glm(Ypred~Exposure+Black+White, data=analysisdata, family=binomial()) summary(p2) # Using Y instead of Ypred, the true association is null p1true<-glm(Y~Exposure, data=analysisdata, family=binomial()) summary(p1true) p2true<-glm(Y~Exposure+Black+White, data=analysisdata, family=binomial()) summary(p2true) # plot for Ypred unadjusted hyp_newdata1b <- with(analysisdata, data.frame(Exposure)) hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1, newdata = hyp_newdata1b, type = "link", se = TRUE)) hyp_newdata2b <- within(hyp_newdata2b, { PredictedProb <- plogis(fit) LL <- plogis(fit - (1.96 * se.fit)) UL <- plogis(fit + (1.96 * se.fit)) }) tiff(filename = "~/output.tiff", type = "cairo" , res=300, height = 4, width = 4, units = "in") ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + geom_line(aes(), size = 1) + scale_y_continuous("Predicted Proportion Hypertension", breaks = c(.4, .5 ,.6, .7, .8, .9), limits=c(0.4, 1)) + theme_minimal(base_size = 14) + scale_x_continuous("Average day-night noise level (dBA)",breaks=c(40, 50,60,70),labels=c("40","50","60","70")) dev.off() # plot for Ypred adjusted hyp_newdata1b <- with(analysisdata, data.frame(Exposure,Black=mean(Black),White=mean(White))) hyp_newdata2b <- cbind(hyp_newdata1b, predict(p2, newdata = hyp_newdata1b, type = "link", se = TRUE)) hyp_newdata2b <- within(hyp_newdata2b, { PredictedProb <- plogis(fit) LL <- plogis(fit - (1.96 * se.fit)) UL <- plogis(fit + (1.96 * se.fit)) }) tiff(filename = "~/output.tiff", type = "cairo" , res=300, height = 4, width = 4, units = "in") ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + geom_line(aes(), size = 1) + scale_y_continuous("Predicted Proportion Hypertension", breaks = c(.4, .5 ,.6, .7, .8, .9), limits=c(0, 1)) + theme_minimal(base_size = 14) + scale_x_continuous("Average day-night noise level (dBA)",breaks=c(40, 50,60,70),labels=c("40","50","60","70")) dev.off() # plot for true Y adjusted hyp_newdata1b <- with(analysisdata, data.frame(Exposure,Black=mean(Black),White=mean(White))) hyp_newdata2b <- cbind(hyp_newdata1b, predict(p2true, newdata = hyp_newdata1b, type = "link", se = TRUE)) hyp_newdata2b <- within(hyp_newdata2b, { PredictedProb <- plogis(fit) LL <- plogis(fit - (1.96 * se.fit)) UL <- plogis(fit + (1.96 * se.fit)) }) tiff(filename = "~/output.tiff", type = "cairo" , res=300, height = 4, width = 4, units = "in") ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + geom_line(aes(), size = 1) + scale_y_continuous("Predicted Proportion Hypertension", breaks = c(.4, .5 ,.6, .7, .8, .9), limits=c(0, 1)) + theme_minimal(base_size = 14) + scale_x_continuous("Average day-night noise level (dBA)",breaks=c(40, 50,60,70),labels=c("40","50","60","70")) dev.off() ########################################################################## # When there is no association between X and Y there can be an apparent # association between Xpred and Y ########################################################################## set.seed(1) # Generate auxiliary data N<-10000 #Generate outcome Y<-rnorm(N) # Generate non-collider covariates that will predict Exposure Homeowner<-rbinom(N,1,.52) Population<-rnorm(N,3976,6765) NO2<-rnorm(N,10.1,4.5) PM25<-rnorm(N,9.1,1.9) # Generate collider covariates to go into ExpPred model # Poverty is strong collider between Homeowner and Y PovertyPredictor<-(1-Homeowner)*Y+.1*(Homeowner)*Y Poverty<-rnorm(N,PovertyPredictor,1) Race<-rmultinom(N,1,prob=c(.2,.45,1-.65)) White<-Race[2,] Black<-Race[1,] Female<-rbinom(N,1,.45) Old<-rbinom(N,1,.05) #Generate Exposure based on Homeowner but not Poverty so that the path between Exposure and Y is collided Exposure<-rnorm(N,(1-Homeowner)*.7+Homeowner*.3,1) #Save data auxiliarydata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female, Old)) #Overwrite this to generate analysis data N<-1000 #Generate outcome Y<-rnorm(N) # Generate non-collider covariates that will predict Exposure Homeowner<-rbinom(N,1,.52) Population<-rnorm(N,3976,6765) NO2<-rnorm(N,10.1,4.5) PM25<-rnorm(N,9.1,1.9) # Generate collider covariates to go into ExpPred model # Poverty is strong collider between Homeowner and Y PovertyPredictor<-(1-Homeowner)*Y+.1*(Homeowner)*Y Poverty<-rnorm(N,PovertyPredictor,1) Race<-rmultinom(N,1,prob=c(.2,.45,1-.65)) White<-Race[2,] Black<-Race[1,] Female<-rbinom(N,1,.45) Old<-rbinom(N,1,.05) #Generate Exposure based on Homeowner but not Poverty so that the path between Exposure and Y is collided Exposure<-rnorm(N,(1-Homeowner)*.7+Homeowner*.3,1) #Save data analysisdata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female, Old)) # Create analysis data outcome variable #Predict Exposure using auxiliary data Model1<-glm(Exposure~Poverty+Black+White+Female+Old, family=gaussian, data=auxiliarydata) #Get fitted values for analysis data analysisdata$ExpPred<-predict(Model1,newdata=analysisdata,type="response") #Analyze data # Unadjusted model shows strong association p1<-glm(Y~ExpPred, data=analysisdata, family=gaussian) summary(p1) # Using Y instead of Ypred, the true association is null p1true<-glm(Y~Exposure, data=analysisdata, family=gaussian) summary(p1true) analysisdata$predictY<-predict(p1,newdata=analysisdata,type="response") analysisdata$predictYtrue<-predict(p1true, newdata=analysisdata,type="response") plot(ExpPred,predictY,data=analysisdata) plot(Exposure,predictYtrue,data=analysisdata) # Figure using ExpPred hyp_newdata1b <- with(analysisdata, data.frame(ExpPred)) hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1, newdata = hyp_newdata1b, type = "response", se = TRUE)) hyp_newdata2b <- within(hyp_newdata2b, { PredictedProb <-(fit) LL <-(fit - (1.96 * se.fit)) UL <-(fit + (1.96 * se.fit)) }) ggplot(hyp_newdata2b, aes(x = ExpPred, y = fit)) + geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + geom_line(aes(), size = 1) + scale_y_continuous("Predicted Proportion Hypertension", # breaks = c(.3,.35, .4 ,.45, .5, .55,.6), #limits=c(-1, 2) ) + theme_minimal(base_size = 14) + scale_x_continuous("Average day-night noise level (dBA)", #breaks=c(0,0.33,0.67,1),labels=c("40","50","60","70") ) # Figure using the true Exposure hyp_newdata1b <- with(analysisdata, data.frame(Exposure)) hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1true, newdata = hyp_newdata1b, type = "response", se = TRUE)) hyp_newdata2b <- within(hyp_newdata2b, { PredictedProb <-(fit) LL <-(fit - (1.96 * se.fit)) UL <-(fit + (1.96 * se.fit)) }) ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + geom_line(aes(), size = 1) + scale_y_continuous("Predicted Proportion Hypertension", #breaks = c(.3,.35, .4 ,.45, .5, .55,.6), limits=c(-1, 2) ) + theme_minimal(base_size = 14) + scale_x_continuous("Average day-night noise level (dBA)", #breaks=c(0,0.33,0.67,1),labels=c("40","50","60","70") ) dev.off() ########################################################################## # There is a null association between X and Y conditional on C (it appears negative but not significant), # but it is positive and significant conditional on Cpred
 ########################################################################## set.seed(1) # Generate auxiliary data N<-10000 # Generate covariates # C is a confounder and is caused by poverty, which is not a confounder # The direction of C confounding is positive, masking a truly null relationship between X and Y Race<-rmultinom(N,1,prob=c(.2,.45,1-.65)) White<-Race[2,] Black<-Race[1,] Poverty<-rbinom(N,1,.3*Black + .08*White) Female<-rbinom(N,1,.45) Old<-rbinom(N,1,.05) Cmean<-5*Poverty+10*(1-Poverty)+10*rnorm(N) C<-rnorm(N,Cmean,5) #Generate exposure, positive relationship with Poverty #Epredictor<- Exposure<-rnorm(N,C,5) #Generate Y, negatively associated with Poverty and positively associated with Exposure Ymean<- C Y<-rnorm(N,C,5) #Save data auxiliarydata<-as.data.frame(cbind(Y,Exposure,C, Poverty,Black,White,Female,Old)) #Overwrite this to generate analysis data N<-1000 # Generate covariates # C is a confounder and is caused by poverty, which is not a confounder # The direction of C confounding is positive, masking a truly null relationship between X and Y Race<-rmultinom(N,1,prob=c(.2,.45,1-.65)) White<-Race[2,] Black<-Race[1,] Poverty<-rbinom(N,1,.3*Black + .08*White) Female<-rbinom(N,1,.45) Old<-rbinom(N,1,.05) Cmean<-5*Poverty+10*(1-Poverty)+10*rnorm(N) C<-rnorm(N,Cmean,5) #Generate exposure, positive relationship with Poverty #Epredictor<- Exposure<-rnorm(N,C,5) #Generate Y, negatively associated with Poverty and positively associated with Exposure Ymean<- C Y<-rnorm(N,C,5) #Save data analysisdata<-as.data.frame(cbind(Y,Exposure,C, Poverty,Black,White,Female,Old)) # Create analysis data outcome variable #Predict C using auxiliary data Model1<-glm(C~Poverty+Black+White+Female+Old, family=gaussian, data=auxiliarydata) #Get fitted values for analysis data analysisdata$Cpred<-predict(Model1,newdata=analysisdata,type="response") #Analyze data # Model with Cpred shows association p1<-glm(Y~Exposure+Cpred, data=analysisdata, family=gaussian) summary(p1) # Using C instead of Cpred, the true association is null p1true<-glm(Y~Exposure+C, data=analysisdata, family=gaussian) summary(p1true) analysisdata$predictY<-predict(p1,newdata=analysisdata,type="response") analysisdata$predictYtrue<-predict(p1true, newdata=analysisdata,type="response") plot(Exposure,predictY,data=analysisdata) plot(Exposure,predictYtrue,data=analysisdata) # plot for Cpred hyp_newdata1b <- as.data.frame(cbind(analysisdata$Exposure,rep(mean(analysisdata$Cpred),length(analysisdata$Cpred)))) names(hyp_newdata1b)<-c("Exposure","Cpred") hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1, newdata = hyp_newdata1b, type = "response", se = TRUE)) hyp_newdata2b <- within(hyp_newdata2b, { PredictedProb <- fit LL <- fit - (1.96 * se.fit) UL <- fit + (1.96 * se.fit) }) plot(hyp_newdata2b$Exposure,hyp_newdata2b$PredictedProb) tiff(filename = "~/output.tiff", type = "cairo" , res=300, height = 4, width = 4, units = "in") ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + geom_line(aes(), size = 1) + scale_y_continuous("Predicted Proportion Hypertension", #breaks = c(.4, .5 ,.6, .7, .8, .9), #limits=c(-150, 250) ) + theme_minimal(base_size = 14) + scale_x_continuous("Average day-night noise level (dBA)", #breaks=c(40, 50,60,70), #labels=c("40","50","60","70") ) dev.off() # plot for true C hyp_newdata1b <- as.data.frame(cbind(analysisdata$Exposure,rep(mean(analysisdata$C),length(analysisdata$C)))) names(hyp_newdata1b)<-c("Exposure","C") hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1true, newdata = hyp_newdata1b, type = "response", se = TRUE)) hyp_newdata2b <- within(hyp_newdata2b, { PredictedProb <-(fit) LL <-(fit - (1.96 * se.fit)) UL <-(fit + (1.96 * se.fit)) }) plot(hyp_newdata2b$Exposure,hyp_newdata2b$PredictedProb) tiff(filename = "~/output.tiff", type = "cairo" , res=300, height = 4, width = 4, units = "in") ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + geom_line(aes(), size = 1) + scale_y_continuous("Predicted Proportion Hypertension", #breaks = c(.4, .5 ,.6, .7, .8, .9), #limits=c(-150, 250) ) + theme_minimal(base_size = 14) + scale_x_continuous("Average day-night noise level (dBA)"#,breaks=c(40, 50,60,70),labels=c("40","50","60","70") ) dev.off() ########################################################################## # The association between X and Y is significant but in opposite directions controling for C vs Cpred
 ########################################################################## set.seed(1) # Generate auxiliary data N<-10000 # Generate covariates # C is a confounder and is caused by poverty, which is also a confounder # The direction of C confounding is positive, masking a truly negative relationship between X and Y Race<-rmultinom(N,1,prob=c(.2,.45,1-.65)) White<-Race[2,] Black<-Race[1,] Poverty<-rbinom(N,1,.3*Black + .08*White) Female<-rbinom(N,1,.45) Old<-rbinom(N,1,.05) Cmean<-5*Poverty+10*(1-Poverty)+10*rnorm(N) C<-rnorm(N,Cmean,10) #Generate exposure, positive relationship with Poverty Epredictor<-(Poverty+C) Exposure<-rnorm(N,Epredictor,1) #Generate Y, negatively associated with Poverty and positively associated with Exposure Ymean<-Poverty + 5*C - Exposure Y<-rnorm(N,Ymean,50) #Save data auxiliarydata<-as.data.frame(cbind(Y,Exposure,C, Poverty,Black,White,Female,Old)) #Overwrite this to generate analysis data N<-1000 # Generate covariates # C is a confounder and is caused by poverty, which is also a confounder # The direction of C confounding is positive, masking a truly negative relationship between X and Y Race<-rmultinom(N,1,prob=c(.2,.45,1-.65)) White<-Race[2,] Black<-Race[1,] Poverty<-rbinom(N,1,.3*Black + .08*White) Female<-rbinom(N,1,.45) Old<-rbinom(N,1,.05) Cmean<-5*Poverty+10*(1-Poverty)+10*rnorm(N) C<-rnorm(N,Cmean,10) #Generate exposure, positive relationship with Poverty Epredictor<-(Poverty+C) Exposure<-rnorm(N,Epredictor,1) #Generate Y, negatively associated with Poverty and positively associated with Exposure Ymean<-Poverty + 5*C - Exposure Y<-rnorm(N,Ymean,50) #Save data analysisdata<-as.data.frame(cbind(Y,Exposure,C, Poverty,Black,White,Female,Old)) # Create analysis data outcome variable #Predict C using auxiliary data Model1<-glm(C~Poverty+Black+White+Female+Old, family=gaussian, data=auxiliarydata) #Get fitted values for analysis data analysisdata$Cpred<-predict(Model1,newdata=analysisdata,type="response") #Analyze data # Model with Cpred shows association p1<-lm(Y~Exposure+Cpred+Poverty, data=analysisdata) summary(p1) # Using C instead of Cpred, the true association is null p1true<-lm(Y~Exposure+C+Poverty, data=analysisdata) summary(p1true) # PLOT FOR CPred # get predicted values by hand Predicted<-3.0772 + (3.9508*analysisdata$Exposure) - (0.2638*mean(analysisdata$Cpred)) - (5.0991*mean(analysisdata$Poverty)) # get se.fit by hand newdata2<-cbind(rep(1,length(analysisdata$Cpred)), analysisdata$Exposure, rep(mean(analysisdata$Cpred),length(analysisdata$Cpred)), rep(mean(analysisdata$Poverty),length(analysisdata$Cpred))) std.er<-rep(0,length(analysisdata$Cpred)) for (i in 1:length(analysisdata$Cpred)) { C<-newdata2[i,] std.er[i] <- sqrt(t(C) %*% vcov(p1) %*% C) } LL <- Predicted - (1.96 * std.er) UL <- Predicted + (1.96 * std.er) hyp_newdata2b <- as.data.frame( cbind(analysisdata$Exposure,Predicted, LL, UL)) names(hyp_newdata2b)<-c("Exposure","PredictedProb","LL","UL") tiff(filename = "~/output.tiff", type = "cairo" , res=300, height = 4, width = 4, units = "in") ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + geom_line(aes(), size = 1) + scale_y_continuous("Predicted Proportion Hypertension", #breaks = c(.4, .5 ,.6, .7, .8, .9), #limits=c(-150, 250) ) + theme_minimal(base_size = 14) + scale_x_continuous("Average day-night noise level (dBA)", #breaks=c(40, 50,60,70), #labels=c("40","50","60","70") ) # PLOT FOR TRUE C # get predicted values by hand Predicted<-0.6115 - (3.6041*analysisdata$Exposure) + (7.4199*mean(analysisdata$C)) + (6.6686*mean(analysisdata$Poverty)) # get se.fit by hand newdata2<-cbind(rep(1,length(analysisdata$C)), analysisdata$Exposure, rep(mean(analysisdata$C),length(analysisdata$C)), rep(mean(analysisdata$Poverty),length(analysisdata$C))) std.er<-rep(0,length(analysisdata$Cpred)) for (i in 1:length(analysisdata$Cpred)) { C<-newdata2[i,] std.er[i] <- sqrt(t(C) %*% vcov(p1) %*% C) } LL <- Predicted - (1.96 * std.er) UL <- Predicted + (1.96 * std.er) hyp_newdata2b <- as.data.frame( cbind(analysisdata$Exposure,Predicted, LL, UL)) names(hyp_newdata2b)<-c("Exposure","PredictedProb","LL","UL") tiff(filename = "~/output.tiff", type = "cairo" , res=300, height = 4, width = 4, units = "in") ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + geom_line(aes(), size = 1) + scale_y_continuous("Predicted Proportion Hypertension", #breaks = c(.4, .5 ,.6, .7, .8, .9), #limits=c(-150, 250) ) + theme_minimal(base_size = 14) + scale_x_continuous("Average day-night noise level (dBA)", #breaks=c(40, 50,60,70), #labels=c("40","50","60","70") ) dev.off()