library(tidyverse)
expit <- function(x) { exp(x)/(1+exp(x)) }


##########################################################################

# When there is no association between X and Y there can be an apparent 
# association between X and Ypred 
##########################################################################

set.seed(1) 
	
# Generate auxiliary data 

	N<-10000

	#Generate exposure 
		
		Exposure<-runif(N,40,70)

	# Generate non-collider covariates that will predict Y
	
		Homeowner<-rbinom(N,1,.52)
		Population<-rnorm(N,3976,6765)
		NO2<-rnorm(N,10.1,4.5)
		PM25<-rnorm(N,9.1,1.9)	
		
	# Generate collider covariates to go into Ypred model
		# Poverty is strong collider between Homeowner and Exposure

		PovertyPredictor<-(1-Homeowner)*Exposure+.1*(Homeowner)*Exposure
		Poverty<-rbinom(N,1,PovertyPredictor/max(PovertyPredictor))
		
		Race<-rmultinom(N,1,prob=c(.2,.45,1-.65))
   		White<-Race[2,]
		Black<-Race[1,]
		Female<-rbinom(N,1,.45)
		Old<-rbinom(N,1,.05)

	#Generate Y based on Homeowner but not Poverty so that the path between Exposure and Y is collided
             
    	   	Y<-rbinom(N,1,(1-Homeowner)*.7+Homeowner*.3)
    
	#Save data
    
   	 	auxiliarydata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female, Old))
    
    
#Overwrite this to generate analysis data 

	N<-1000
	
	#Generate exposure 
	
		Exposure<-runif(N)

	# Generate non-collider covariates that will predict Y
	
		Homeowner<-rbinom(N,1,.52)
		Population<-rnorm(N,3976,6765)
		NO2<-rnorm(N,10.1,4.5)
		PM25<-rnorm(N,9.1,1.9)	
		
	# Generate collider covariates to go into Ypred model
		# Poverty is strong collider between Homeowner and Exposure

		PovertyPredictor<-(1-Homeowner)*Exposure+.1*(Homeowner)*Exposure
		Poverty<-rbinom(N,1,PovertyPredictor/max(PovertyPredictor))
		
		Race<-rmultinom(N,1,prob=c(.2,.45,1-.65))
   		White<-Race[2,]
		Black<-Race[1,]
		Female<-rbinom(N,1,.45)
		Old<-rbinom(N,1,.05)

	#Generate Y based on Homeowner but not Poverty so that the path between Exposure and Y is collided
             
    	   	Y<-rbinom(N,1,(1-Homeowner)*.7+Homeowner*.3)

    
	#Save data
    
   	 	analysisdata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female, Old))
    
  
# Create analysis data outcome variable

	#Predict Y using auxiliary data

		Model1<-glm(Y~Poverty+Black+White+Female+Old, family=binomial, data=auxiliarydata)
		
	#Get fitted values for analysis data
		
		Ypred<-predict(Model1,newdata=analysisdata,type="response")
			

#Analyze data

	# Unadjusted model shows strong association
	
	p1<-glm(Ypred~Exposure, data=analysisdata, family=binomial())		
	summary(p1)
	
	# Using Y instead of Ypred, the true association is null
	
	p1true<-glm(Y~Exposure, data=analysisdata, family=binomial())		
	summary(p1true)


# Figure using Ypred

	hyp_newdata1b <- with(analysisdata, data.frame(Exposure))
                                            
	hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1, newdata = hyp_newdata1b, 
                                              type = "link", se = TRUE))
	hyp_newdata2b <- within(hyp_newdata2b, {
			  PredictedProb <- plogis(fit)
			  LL <- plogis(fit - (1.96 * se.fit))
			  UL <- plogis(fit + (1.96 * se.fit))
			})


		ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + 
		geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + 
 		geom_line(aes(), size = 1) +
		scale_y_continuous("Predicted Proportion Hypertension", breaks = c(.3,.35, .4 ,.45, .5, .55,.6), limits=c(.3, .6)) +
		theme_minimal(base_size = 14) +
		scale_x_continuous("Average day-night noise level (dBA)",breaks=c(0,0.33,0.67,1),labels=c("40","50","60","70"))
		
		
# Figure using the true Y 

	hyp_newdata1b <- with(analysisdata, data.frame(Exposure))
                                            
	hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1true, newdata = hyp_newdata1b, 
                                              type = "link", se = TRUE))
	hyp_newdata2b <- within(hyp_newdata2b, {
			  PredictedProb <- plogis(fit)
			  LL <- plogis(fit - (1.96 * se.fit))
			  UL <- plogis(fit + (1.96 * se.fit))
			})

		ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + 
		geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + 
 		geom_line(aes(), size = 1) +
		scale_y_continuous("Predicted Proportion Hypertension", breaks = c(.3,.35, .4 ,.45, .5, .55,.6), limits=c(.3, .6)) +
		theme_minimal(base_size = 14) +
		scale_x_continuous("Average day-night noise level (dBA)",breaks=c(0,0.33,0.67,1),labels=c("40","50","60","70"))
		
		dev.off()


##########################################################################

# When there is an association between X and Y the association between 
# X and Ypred can go in the opposite direction or be null 
##########################################################################	

set.seed(1)
	
# Generate auxiliary data 

	N<-10000

	# Generate covariates
	# Poverty will be a confounder
	
	    Race<-rmultinom(N,1,prob=c(.2,.45,1-.65))
   		White<-Race[2,]
		Black<-Race[1,]
		Poverty<-rbinom(N,1,.3*Black+.08*White)
		Homeowner<-rbinom(N,1,.1*Poverty*Black+.2*Poverty+.2)
		Population<-rnorm(N,3976,6765)
		NO2<-rnorm(N,10.1,4.5)
		PM25<-rnorm(N,9.1,1.9)
		Female<-rbinom(N,1,.45)
		Old<-rbinom(N,1,.05)		
	
	#Generate exposure, positive relationship with Poverty
		exp_mean<-Poverty*60+(1-Poverty)*52
		Exposure<-rnorm(N,exp_mean,4)

	#Generate Y, negatively associated with Poverty and positively associated with Exposure
#		Ypredictor<-Poverty*(Exposure-20)+(1-Poverty)*(Exposure+20)
		Ypredictor<-Poverty*(Exposure-15)+(1-Poverty)*(Exposure+15)
  		Y<-rbinom(N,1,Ypredictor/max(Ypredictor))
    
	#Save data
    
   	 	auxiliarydata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female,Old))
    
    
#Overwrite this to generate analysis data 

	N<-1000
	
	# Generate covariates
	# Poverty will be a confounder
	
	    Race<-rmultinom(N,1,prob=c(.2,.45,1-.65))
   		White<-Race[2,]
		Black<-Race[1,]
		Poverty<-rbinom(N,1,.3*Black+.08*White)
		Homeowner<-rbinom(N,1,.1*Poverty*Black+.2*Poverty+.2)
		Population<-rnorm(N,3976,6765)
		NO2<-rnorm(N,10.1,4.5)
		PM25<-rnorm(N,9.1,1.9)
		Female<-rbinom(N,1,.45)
		Old<-rbinom(N,1,.05)
		
	
	#Generate exposure, positive relationship with Poverty
		exp_mean<-Poverty*60+(1-Poverty)*50
		Exposure<-rnorm(N,exp_mean,5)

	#Generate Y, negatively associated with Poverty and positively associated with Exposure
#		Ypredictor<-Poverty*(Exposure-20)+(1-Poverty)*(Exposure+20)
		Ypredictor<-Poverty*(Exposure-15)+(1-Poverty)*(Exposure+15)
  		Y<-rbinom(N,1,Ypredictor/max(Ypredictor))
   
    
	#Save data
    
   	 	analysisdata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female, Old))
    
  
# Create analysis data outcome variable

	#Predict Y using auxiliary data

		Model1<-glm(Y~Poverty+Black+White+Female+Old, family=binomial, data=auxiliarydata)
		
	#Get fitted values for analysis data
		
		Ypred<-predict(Model1,newdata=analysisdata,type="response")
			

#Analyze data

	# Unadjusted model shows strong association
	
	p1<-glm(Ypred~Exposure, data=analysisdata, family=binomial())		
	summary(p1)
	
	# Using Y instead of Ypred, the true association is null
	
	p1true<-glm(Y~Exposure, data=analysisdata, family=binomial())		
	summary(p1true)


# plot for Ypred

	hyp_newdata1b <- with(analysisdata, data.frame(Exposure))
                                            
	hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1, newdata = hyp_newdata1b, 
                                              type = "link", se = TRUE))
	hyp_newdata2b <- within(hyp_newdata2b, {
			  PredictedProb <- plogis(fit)
			  LL <- plogis(fit - (1.96 * se.fit))
			  UL <- plogis(fit + (1.96 * se.fit))
			})

	tiff(filename = "~/output.tiff", type = "cairo" ,  res=300, height = 4, width = 4, units = "in")
		ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + 
		geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + 
 		geom_line(aes(), size = 1) +
		scale_y_continuous("Predicted Proportion Hypertension", breaks = c(.4, .5 ,.6, .7, .8, .9), limits=c(0.4, 1)) +
		theme_minimal(base_size = 14) +
		scale_x_continuous("Average day-night noise level (dBA)",breaks=c(40, 50,60,70),labels=c("40","50","60","70"))
		

		dev.off()
	
	
# plot for true Y

	hyp_newdata1b <- with(analysisdata, data.frame(Exposure))
                                            
	hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1true, newdata = hyp_newdata1b, 
                                              type = "link", se = TRUE))
	hyp_newdata2b <- within(hyp_newdata2b, {
			  PredictedProb <- plogis(fit)
			  LL <- plogis(fit - (1.96 * se.fit))
			  UL <- plogis(fit + (1.96 * se.fit))
			})

	tiff(filename = "~/output.tiff", type = "cairo" ,  res=300, height = 4, width = 4, units = "in")
		ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + 
		geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + 
 		geom_line(aes(), size = 1) +
		scale_y_continuous("Predicted Proportion Hypertension", breaks = c(.4, .5 ,.6, .7, .8, .9), limits=c(0.4, 1)) +
		theme_minimal(base_size = 14) +
		scale_x_continuous("Average day-night noise level (dBA)",breaks=c(40, 50,60,70),labels=c("40","50","60","70"))
		

		dev.off()	
	
	
##########################################################################

# There is a strong association between Exposure and true Y, but the relationship 
# between Exposure and Ypred is null--even though there are no confounders 
##########################################################################	
	
set.seed(1)
	
# Generate auxiliary data 

	N<-10000

	# Generate covariates
	
	    Race<-rmultinom(N,1,prob=c(.2,.45,1-.65))
   		White<-Race[2,]
		Black<-Race[1,]
		Poverty<-rbinom(N,1,.3*Black+.08*White)
		Homeowner<-rbinom(N,1,.1*Poverty*Black+.2*Poverty+.2)
		Population<-rnorm(N,3976,6765)
		NO2<-rnorm(N,10.1,4.5)
		PM25<-rnorm(N,9.1,1.9)
		Female<-rbinom(N,1,.45)
		Old<-rbinom(N,1,.05)		
	
	#Generate exposure independent of covariates--so there is no confounding
		Exposure<-rnorm(N,55,7)


	#Generate Y based on covariates above, and exposure
       Error<-rnorm(N,2,1)  
	   Predictor<-White+Black+Poverty+Homeowner+NO2/max(NO2)+PM25/max(PM25)+Old+Female+
	   				Population/max(Population)+ 10*Exposure/max(Exposure)
	   Probability<-(abs(Predictor/max(Predictor)))
       Y<-rbinom(N,1,prob=Probability)
    
	#Save data
    
   	 	auxiliarydata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female,Old))
    
    
#Overwrite this to generate analysis data 

	N<-1000
	
	# Generate covariates
	
	    Race<-rmultinom(N,1,prob=c(.2,.45,1-.65))
   		White<-Race[2,]
		Black<-Race[1,]
		Poverty<-rbinom(N,1,.3*Black+.08*White)
		Homeowner<-rbinom(N,1,.1*Poverty*Black+.2*Poverty+.2)
		Population<-rnorm(N,3976,6765)
		NO2<-rnorm(N,10.1,4.5)
		PM25<-rnorm(N,9.1,1.9)
		Female<-rbinom(N,1,.45)
		Old<-rbinom(N,1,.05)		
	
	#Generate exposure independent of covariates--so there is no confounding
		Exposure<-rnorm(N,55,5)


	#Generate Y based on covariates above, and exposure
       Error<-rnorm(N,2,1)  
	   Predictor<-White+Black+Poverty+Homeowner+NO2/max(NO2)+PM25/max(PM25)+Old+Female+
	   				Population/max(Population) + 10*Exposure/max(Exposure)
	   Probability<-(abs(Predictor/max(Predictor)))
       Y<-rbinom(N,1,prob=Probability)   

	#Save data
    
   	 	analysisdata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female, Old))
    
  
# Create analysis data outcome variable

	#Predict Y using auxiliary data

		Model1<-glm(Y~Poverty+Black+White+Female+Old, family=binomial, data=auxiliarydata)
		
	#Get fitted values for analysis data
		
		Ypred<-predict(Model1,newdata=analysisdata,type="response")
			

#Analyze data

	# Unadjusted model shows strong association
	
	p1<-glm(Ypred~Exposure, data=analysisdata, family=binomial())		
	summary(p1)
	
	# Using Y instead of Ypred, the true association is null
	
	p1true<-glm(Y~Exposure, data=analysisdata, family=binomial())		
	summary(p1true)

	
# plot for Ypred

	hyp_newdata1b <- with(analysisdata, data.frame(Exposure))
                                            
	hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1, newdata = hyp_newdata1b, 
                                              type = "link", se = TRUE))
	hyp_newdata2b <- within(hyp_newdata2b, {
			  PredictedProb <- plogis(fit)
			  LL <- plogis(fit - (1.96 * se.fit))
			  UL <- plogis(fit + (1.96 * se.fit))
			})

	tiff(filename = "~/output.tiff", type = "cairo" ,  res=300, height = 4, width = 4, units = "in")
		ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + 
		geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + 
 		geom_line(aes(), size = 1) +
		scale_y_continuous("Predicted Proportion Hypertension", breaks = c(.4, .5 ,.6, .7, .8, .9), limits=c(0.4, 1)) +
		theme_minimal(base_size = 14) +
		scale_x_continuous("Average day-night noise level (dBA)",breaks=c(40, 50,60,70),labels=c("40","50","60","70"))
		

		dev.off()
	
	
# plot for true Y

	hyp_newdata1b <- with(analysisdata, data.frame(Exposure))
                                            
	hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1true, newdata = hyp_newdata1b, 
                                              type = "link", se = TRUE))
	hyp_newdata2b <- within(hyp_newdata2b, {
			  PredictedProb <- plogis(fit)
			  LL <- plogis(fit - (1.96 * se.fit))
			  UL <- plogis(fit + (1.96 * se.fit))
			})

	tiff(filename = "~/output.tiff", type = "cairo" ,  res=300, height = 4, width = 4, units = "in")
		ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + 
		geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + 
 		geom_line(aes(), size = 1) +
		scale_y_continuous("Predicted Proportion Hypertension", breaks = c(.4, .5 ,.6, .7, .8, .9), limits=c(0.4, 1)) +
		theme_minimal(base_size = 14) +
		scale_x_continuous("Average day-night noise level (dBA)",breaks=c(40, 50,60,70),labels=c("40","50","60","70"))
		

		dev.off()	
	
	
##########################################################################

# When there is an association between Ypred and Exposure, it becomes null 
# when controlling for variables in the Ypred model

##########################################################################	

set.seed(1)

	
# Generate auxiliary data 

	N<-10000

	# Generate covariates
	
	    Race<-rmultinom(N,1,prob=c(.2,.45,1-.65))
   		White<-Race[2,]
		Black<-Race[1,]
		Poverty<-rbinom(N,1,.3*Black+.08*White)
		Homeowner<-rbinom(N,1,.1*Poverty*Black+.2*Poverty+.2)
		Population<-rnorm(N,3976,6765)
		NO2<-rnorm(N,10.1,4.5)
		PM25<-rnorm(N,9.1,1.9)
		Female<-rbinom(N,1,.45)
		Old<-rbinom(N,1,.05)		
	
	#Generate exposure based on Poverty
		exp_mean<-Poverty*60+(1-Poverty)*52
		Exposure<-rnorm(N,exp_mean,4)	


	#Generate Y based on Black, White and Exposure (no common causes with exposure so no confounding)
       Error<-rnorm(N,2,1)  
	   Predictor<-White+Black+Exposure/max(Exposure)
	   Probability<-(abs(Predictor/max(Predictor)))
       Y<-rbinom(N,1,prob=Probability)
    
	#Save data
    
   	 	auxiliarydata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female,Old))
    
    
#Overwrite this to generate analysis data 

	N<-1000
	
	# Generate covariates
	
	    Race<-rmultinom(N,1,prob=c(.2,.45,1-.65))
   		White<-Race[2,]
		Black<-Race[1,]
		Poverty<-rbinom(N,1,.3*Black+.08*White)
		Homeowner<-rbinom(N,1,.1*Poverty*Black+.2*Poverty+.2)
		Population<-rnorm(N,3976,6765)
		NO2<-rnorm(N,10.1,4.5)
		PM25<-rnorm(N,9.1,1.9)
		Female<-rbinom(N,1,.45)
		Old<-rbinom(N,1,.05)		
	
	
	#Generate exposure based on Poverty
		exp_mean<-Poverty*60+(1-Poverty)*52
		Exposure<-rnorm(N,exp_mean,4)	


	#Generate Y based on Black, White and Exposure (no common causes with exposure so no confounding)
       Error<-rnorm(N,2,1)  
	   Predictor<-White+Black+Exposure/max(Exposure)
	   Probability<-(abs(Predictor/max(Predictor)))
       Y<-rbinom(N,1,prob=Probability)
     

	#Save data
    
   	 	analysisdata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female, Old))
    
  
# Create analysis data outcome variable

	#Predict Y using auxiliary data

		Model1<-glm(Y~Poverty+Black+White+Female+Old, family=binomial, data=auxiliarydata)
		
	#Get fitted values for analysis data
		
		Ypred<-predict(Model1,newdata=analysisdata,type="response")
			

#Analyze data

	# Unadjusted model shows strong association
	
	p1<-glm(Ypred~Exposure, data=analysisdata, family=binomial())		
	summary(p1)
	
	p2<-glm(Ypred~Exposure+Black+White, data=analysisdata, family=binomial())
	summary(p2)
	
	# Using Y instead of Ypred, the true association is null
	
	p1true<-glm(Y~Exposure, data=analysisdata, family=binomial())		
	summary(p1true)

	p2true<-glm(Y~Exposure+Black+White, data=analysisdata, family=binomial())
	summary(p2true)
	
	
# plot for Ypred unadjusted

	hyp_newdata1b <- with(analysisdata, data.frame(Exposure))
                                            
	hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1, newdata = hyp_newdata1b, 
                                              type = "link", se = TRUE))
	hyp_newdata2b <- within(hyp_newdata2b, {
			  PredictedProb <- plogis(fit)
			  LL <- plogis(fit - (1.96 * se.fit))
			  UL <- plogis(fit + (1.96 * se.fit))
			})

	tiff(filename = "~/output.tiff", type = "cairo" ,  res=300, height = 4, width = 4, units = "in")
		ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + 
		geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + 
 		geom_line(aes(), size = 1) +
		scale_y_continuous("Predicted Proportion Hypertension", breaks = c(.4, .5 ,.6, .7, .8, .9), limits=c(0.4, 1)) +
		theme_minimal(base_size = 14) +
		scale_x_continuous("Average day-night noise level (dBA)",breaks=c(40, 50,60,70),labels=c("40","50","60","70"))
		

		dev.off()
	
	
# plot for Ypred adjusted

	hyp_newdata1b <- with(analysisdata, data.frame(Exposure,Black=mean(Black),White=mean(White)))
                                            
	hyp_newdata2b <- cbind(hyp_newdata1b, predict(p2, newdata = hyp_newdata1b, 
                                              type = "link", se = TRUE))
	hyp_newdata2b <- within(hyp_newdata2b, {
			  PredictedProb <- plogis(fit)
			  LL <- plogis(fit - (1.96 * se.fit))
			  UL <- plogis(fit + (1.96 * se.fit))
			})

	tiff(filename = "~/output.tiff", type = "cairo" ,  res=300, height = 4, width = 4, units = "in")
		ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + 
		geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + 
 		geom_line(aes(), size = 1) +
		scale_y_continuous("Predicted Proportion Hypertension", breaks = c(.4, .5 ,.6, .7, .8, .9), limits=c(0, 1)) +
		theme_minimal(base_size = 14) +
		scale_x_continuous("Average day-night noise level (dBA)",breaks=c(40, 50,60,70),labels=c("40","50","60","70"))
		

		dev.off()
	

# plot for true Y adjusted

	hyp_newdata1b <- with(analysisdata, data.frame(Exposure,Black=mean(Black),White=mean(White)))
                                            
	hyp_newdata2b <- cbind(hyp_newdata1b, predict(p2true, newdata = hyp_newdata1b, 
                                              type = "link", se = TRUE))
	hyp_newdata2b <- within(hyp_newdata2b, {
			  PredictedProb <- plogis(fit)
			  LL <- plogis(fit - (1.96 * se.fit))
			  UL <- plogis(fit + (1.96 * se.fit))
			})

	tiff(filename = "~/output.tiff", type = "cairo" ,  res=300, height = 4, width = 4, units = "in")
		ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + 
		geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + 
 		geom_line(aes(), size = 1) +
		scale_y_continuous("Predicted Proportion Hypertension", breaks = c(.4, .5 ,.6, .7, .8, .9), limits=c(0, 1)) +
		theme_minimal(base_size = 14) +
		scale_x_continuous("Average day-night noise level (dBA)",breaks=c(40, 50,60,70),labels=c("40","50","60","70"))
		

		dev.off()	
		
		
##########################################################################

# When there is no association between X and Y there can be an apparent 
# association between Xpred and Y

##########################################################################

set.seed(1) 

# Generate auxiliary data 

	N<-10000

	#Generate outcome 
		
		Y<-rnorm(N)

	# Generate non-collider covariates that will predict Exposure
	
		Homeowner<-rbinom(N,1,.52)
		Population<-rnorm(N,3976,6765)
		NO2<-rnorm(N,10.1,4.5)
		PM25<-rnorm(N,9.1,1.9)	
		
	# Generate collider covariates to go into ExpPred model
		# Poverty is strong collider between Homeowner and Y

		PovertyPredictor<-(1-Homeowner)*Y+.1*(Homeowner)*Y
		Poverty<-rnorm(N,PovertyPredictor,1)
		
		Race<-rmultinom(N,1,prob=c(.2,.45,1-.65))
   		White<-Race[2,]
		Black<-Race[1,]
		Female<-rbinom(N,1,.45)
		Old<-rbinom(N,1,.05)

	#Generate Exposure based on Homeowner but not Poverty so that the path between Exposure and Y is collided
             
    	   	Exposure<-rnorm(N,(1-Homeowner)*.7+Homeowner*.3,1)
    
	#Save data
    
   	 	auxiliarydata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female, Old))
    
    
#Overwrite this to generate analysis data 

	N<-1000
	

	#Generate outcome 
		
		Y<-rnorm(N)

	# Generate non-collider covariates that will predict Exposure
	
		Homeowner<-rbinom(N,1,.52)
		Population<-rnorm(N,3976,6765)
		NO2<-rnorm(N,10.1,4.5)
		PM25<-rnorm(N,9.1,1.9)	
		
	# Generate collider covariates to go into ExpPred model
		# Poverty is strong collider between Homeowner and Y

		PovertyPredictor<-(1-Homeowner)*Y+.1*(Homeowner)*Y
		Poverty<-rnorm(N,PovertyPredictor,1)
		
		Race<-rmultinom(N,1,prob=c(.2,.45,1-.65))
   		White<-Race[2,]
		Black<-Race[1,]
		Female<-rbinom(N,1,.45)
		Old<-rbinom(N,1,.05)

	#Generate Exposure based on Homeowner but not Poverty so that the path between Exposure and Y is collided
             
    	   	Exposure<-rnorm(N,(1-Homeowner)*.7+Homeowner*.3,1)
    
	#Save data
    
   	 	analysisdata<-as.data.frame(cbind(Y,Exposure,Population,NO2,PM25,Poverty,Black,White,Female, Old))
    
  
# Create analysis data outcome variable

	#Predict Exposure using auxiliary data

		Model1<-glm(Exposure~Poverty+Black+White+Female+Old, family=gaussian, data=auxiliarydata)
		
	#Get fitted values for analysis data
		
		analysisdata$ExpPred<-predict(Model1,newdata=analysisdata,type="response")
			

#Analyze data

	# Unadjusted model shows strong association
	
	p1<-glm(Y~ExpPred, data=analysisdata, family=gaussian)		
	summary(p1)
	
	# Using Y instead of Ypred, the true association is null
	
	p1true<-glm(Y~Exposure, data=analysisdata, family=gaussian)		
	summary(p1true)
	
	
	analysisdata$predictY<-predict(p1,newdata=analysisdata,type="response")
	analysisdata$predictYtrue<-predict(p1true, newdata=analysisdata,type="response")
	
	plot(ExpPred,predictY,data=analysisdata)
	plot(Exposure,predictYtrue,data=analysisdata)	
	

# Figure using ExpPred


	hyp_newdata1b <- with(analysisdata, data.frame(ExpPred))
                                            
	hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1, newdata = hyp_newdata1b, 
                                              type = "response", se = TRUE))
	hyp_newdata2b <- within(hyp_newdata2b, {
			  PredictedProb <-(fit)
			  LL <-(fit - (1.96 * se.fit))
			  UL <-(fit + (1.96 * se.fit))
			})


		ggplot(hyp_newdata2b, aes(x = ExpPred, y = fit)) + 
		geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + 
 		geom_line(aes(), size = 1) +
		scale_y_continuous("Predicted Proportion Hypertension",
		# breaks = c(.3,.35, .4 ,.45, .5, .55,.6),
		#limits=c(-1, 2)
		) +
		theme_minimal(base_size = 14) +
		scale_x_continuous("Average day-night noise level (dBA)",
		#breaks=c(0,0.33,0.67,1),labels=c("40","50","60","70")
		)
		
		
# Figure using the true Exposure 

	hyp_newdata1b <- with(analysisdata, data.frame(Exposure))
                                            
	hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1true, newdata = hyp_newdata1b, 
                                              type = "response", se = TRUE))
	hyp_newdata2b <- within(hyp_newdata2b, {
			  PredictedProb <-(fit)
			  LL <-(fit - (1.96 * se.fit))
			  UL <-(fit + (1.96 * se.fit))
			})

		ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + 
		geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + 
 		geom_line(aes(), size = 1) +
		scale_y_continuous("Predicted Proportion Hypertension", 
		#breaks = c(.3,.35, .4 ,.45, .5, .55,.6), 
		limits=c(-1, 2)
		) +
		theme_minimal(base_size = 14) +
		scale_x_continuous("Average day-night noise level (dBA)",
		#breaks=c(0,0.33,0.67,1),labels=c("40","50","60","70")
		)
		
		dev.off()


##########################################################################

# There is a null association between X and Y conditional on C (it appears negative but not significant), 
# but it is positive and significant conditional on Cpred 
##########################################################################	

set.seed(1)

# Generate auxiliary data 

	N<-10000

	# Generate covariates
	# C is a confounder	and is caused by poverty, which is not a confounder
	# The direction of C confounding is positive, masking a truly null relationship between X and Y
	
	    Race<-rmultinom(N,1,prob=c(.2,.45,1-.65))
   		White<-Race[2,]
		Black<-Race[1,]
		Poverty<-rbinom(N,1,.3*Black + .08*White)
		Female<-rbinom(N,1,.45)
		Old<-rbinom(N,1,.05)		
		
		Cmean<-5*Poverty+10*(1-Poverty)+10*rnorm(N)
		C<-rnorm(N,Cmean,5)
	
	#Generate exposure, positive relationship with Poverty
		#Epredictor<-
		Exposure<-rnorm(N,C,5)

	#Generate Y, negatively associated with Poverty and positively associated with Exposure
		Ymean<- C 
  		Y<-rnorm(N,C,5)
    
    
	#Save data
    
   	 	auxiliarydata<-as.data.frame(cbind(Y,Exposure,C, Poverty,Black,White,Female,Old))
    
    
#Overwrite this to generate analysis data 

	N<-1000
	
	# Generate covariates
	# C is a confounder	and is caused by poverty, which is not a confounder
	# The direction of C confounding is positive, masking a truly null relationship between X and Y
	
	    Race<-rmultinom(N,1,prob=c(.2,.45,1-.65))
   		White<-Race[2,]
		Black<-Race[1,]
		Poverty<-rbinom(N,1,.3*Black + .08*White)
		Female<-rbinom(N,1,.45)
		Old<-rbinom(N,1,.05)		
		
		Cmean<-5*Poverty+10*(1-Poverty)+10*rnorm(N)
		C<-rnorm(N,Cmean,5)
	
	#Generate exposure, positive relationship with Poverty
		#Epredictor<-
		Exposure<-rnorm(N,C,5)

	#Generate Y, negatively associated with Poverty and positively associated with Exposure
		Ymean<- C 
  		Y<-rnorm(N,C,5)
    
       
	#Save data
    
   	 	analysisdata<-as.data.frame(cbind(Y,Exposure,C, Poverty,Black,White,Female,Old))
    
  
# Create analysis data outcome variable

	#Predict C using auxiliary data

		Model1<-glm(C~Poverty+Black+White+Female+Old, family=gaussian, data=auxiliarydata)
		
	#Get fitted values for analysis data
		
		analysisdata$Cpred<-predict(Model1,newdata=analysisdata,type="response")
			

#Analyze data

	# Model with Cpred shows association
	
	p1<-glm(Y~Exposure+Cpred, data=analysisdata, family=gaussian)		
	summary(p1)
	
	# Using C instead of Cpred, the true association is null
	
	p1true<-glm(Y~Exposure+C, data=analysisdata, family=gaussian)		
	summary(p1true)


	analysisdata$predictY<-predict(p1,newdata=analysisdata,type="response")
	analysisdata$predictYtrue<-predict(p1true, newdata=analysisdata,type="response")
	
	plot(Exposure,predictY,data=analysisdata)
	plot(Exposure,predictYtrue,data=analysisdata)	
	

# plot for Cpred

	hyp_newdata1b <- as.data.frame(cbind(analysisdata$Exposure,rep(mean(analysisdata$Cpred),length(analysisdata$Cpred))))
	names(hyp_newdata1b)<-c("Exposure","Cpred")
                                            
	hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1, newdata = hyp_newdata1b, 
                                              type = "response", se = TRUE))
	hyp_newdata2b <- within(hyp_newdata2b, {
			  PredictedProb <- fit
			  LL <- fit - (1.96 * se.fit)
			  UL <- fit + (1.96 * se.fit)
			})
			
	plot(hyp_newdata2b$Exposure,hyp_newdata2b$PredictedProb)

	tiff(filename = "~/output.tiff", type = "cairo" ,  res=300, height = 4, width = 4, units = "in")
		ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + 
		geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + 
 		geom_line(aes(), size = 1) +
		scale_y_continuous("Predicted Proportion Hypertension", #breaks = c(.4, .5 ,.6, .7, .8, .9), 
		#limits=c(-150, 250)
		) +
		theme_minimal(base_size = 14) +
		scale_x_continuous("Average day-night noise level (dBA)",
		#breaks=c(40, 50,60,70),
		#labels=c("40","50","60","70")
		)
		

		dev.off()
	
	
# plot for true C

	hyp_newdata1b <- as.data.frame(cbind(analysisdata$Exposure,rep(mean(analysisdata$C),length(analysisdata$C))))
	names(hyp_newdata1b)<-c("Exposure","C")

                                            
	hyp_newdata2b <- cbind(hyp_newdata1b, predict(p1true, newdata = hyp_newdata1b, 
                                              type = "response", se = TRUE))
	hyp_newdata2b <- within(hyp_newdata2b, {
			  PredictedProb <-(fit)
			  LL <-(fit - (1.96 * se.fit))
			  UL <-(fit + (1.96 * se.fit))
			})
			
	plot(hyp_newdata2b$Exposure,hyp_newdata2b$PredictedProb)

	tiff(filename = "~/output.tiff", type = "cairo" ,  res=300, height = 4, width = 4, units = "in")
		ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + 
		geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + 
 		geom_line(aes(), size = 1) +
		scale_y_continuous("Predicted Proportion Hypertension", 
		#breaks = c(.4, .5 ,.6, .7, .8, .9), 
		#limits=c(-150, 250)
		) 
		+ theme_minimal(base_size = 14) +
		scale_x_continuous("Average day-night noise level (dBA)"#,breaks=c(40, 50,60,70),labels=c("40","50","60","70")
		)
		

		dev.off()		

	
##########################################################################

# The association between X and Y is significant but in opposite directions controling for C vs Cpred 
##########################################################################	

set.seed(1)
	
# Generate auxiliary data 

	N<-10000

	# Generate covariates
	# C is a confounder	and is caused by poverty, which is also a confounder
	# The direction of C confounding is positive, masking a truly negative relationship between X and Y
	
	    Race<-rmultinom(N,1,prob=c(.2,.45,1-.65))
   		White<-Race[2,]
		Black<-Race[1,]
		Poverty<-rbinom(N,1,.3*Black + .08*White)
		Female<-rbinom(N,1,.45)
		Old<-rbinom(N,1,.05)		
		
		Cmean<-5*Poverty+10*(1-Poverty)+10*rnorm(N)
		C<-rnorm(N,Cmean,10)
	
	#Generate exposure, positive relationship with Poverty
		Epredictor<-(Poverty+C)
		Exposure<-rnorm(N,Epredictor,1)

	#Generate Y, negatively associated with Poverty and positively associated with Exposure
		Ymean<-Poverty + 5*C - Exposure
  		Y<-rnorm(N,Ymean,50)
    
    
	#Save data
    
   	 	auxiliarydata<-as.data.frame(cbind(Y,Exposure,C, Poverty,Black,White,Female,Old))
    
    
#Overwrite this to generate analysis data 

	N<-1000
	
	# Generate covariates
	# C is a confounder	and is caused by poverty, which is also a confounder
	# The direction of C confounding is positive, masking a truly negative relationship between X and Y
	
	    Race<-rmultinom(N,1,prob=c(.2,.45,1-.65))
   		White<-Race[2,]
		Black<-Race[1,]
		Poverty<-rbinom(N,1,.3*Black + .08*White)
		Female<-rbinom(N,1,.45)
		Old<-rbinom(N,1,.05)		
		
		Cmean<-5*Poverty+10*(1-Poverty)+10*rnorm(N)
		C<-rnorm(N,Cmean,10)
	
	#Generate exposure, positive relationship with Poverty
		Epredictor<-(Poverty+C)
		Exposure<-rnorm(N,Epredictor,1)

	#Generate Y, negatively associated with Poverty and positively associated with Exposure
		Ymean<-Poverty + 5*C - Exposure
  		Y<-rnorm(N,Ymean,50)
        
    
	#Save data
    
   	 	analysisdata<-as.data.frame(cbind(Y,Exposure,C, Poverty,Black,White,Female,Old))
    
  
# Create analysis data outcome variable

	#Predict C using auxiliary data

		Model1<-glm(C~Poverty+Black+White+Female+Old, family=gaussian, data=auxiliarydata)
		
	#Get fitted values for analysis data
		
		analysisdata$Cpred<-predict(Model1,newdata=analysisdata,type="response")
		
	
#Analyze data

	# Model with Cpred shows association
	
	p1<-lm(Y~Exposure+Cpred+Poverty, data=analysisdata)		
	summary(p1)
	
	# Using C instead of Cpred, the true association is null
	
	p1true<-lm(Y~Exposure+C+Poverty, data=analysisdata)		
	summary(p1true)
	
	
# PLOT FOR CPred 

# get predicted values by hand
	Predicted<-3.0772 + (3.9508*analysisdata$Exposure) - (0.2638*mean(analysisdata$Cpred)) - 	
    		(5.0991*mean(analysisdata$Poverty))

# get se.fit by hand	
	newdata2<-cbind(rep(1,length(analysisdata$Cpred)), 		
			analysisdata$Exposure,
			rep(mean(analysisdata$Cpred),length(analysisdata$Cpred)),
			rep(mean(analysisdata$Poverty),length(analysisdata$Cpred)))

	std.er<-rep(0,length(analysisdata$Cpred))
	for (i in 1:length(analysisdata$Cpred))	{
	C<-newdata2[i,]
 	std.er[i] <- sqrt(t(C) %*% vcov(p1) %*% C)
 	}

 			 LL <- Predicted - (1.96 * std.er)
			  UL <- Predicted + (1.96 * std.er)

	hyp_newdata2b <- as.data.frame( cbind(analysisdata$Exposure,Predicted, LL, UL))
	names(hyp_newdata2b)<-c("Exposure","PredictedProb","LL","UL")

	tiff(filename = "~/output.tiff", type = "cairo" ,  res=300, height = 4, width = 4, units = "in")
		ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + 
		geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + 
 		geom_line(aes(), size = 1) +
		scale_y_continuous("Predicted Proportion Hypertension", #breaks = c(.4, .5 ,.6, .7, .8, .9), 
		#limits=c(-150, 250)
		) +
		theme_minimal(base_size = 14) +
		scale_x_continuous("Average day-night noise level (dBA)",
		#breaks=c(40, 50,60,70),
		#labels=c("40","50","60","70")
		)
			
	
# PLOT FOR TRUE C 
	
# get predicted values by hand
	Predicted<-0.6115 - (3.6041*analysisdata$Exposure) + (7.4199*mean(analysisdata$C)) + 	
    		(6.6686*mean(analysisdata$Poverty))

# get se.fit by hand	
	newdata2<-cbind(rep(1,length(analysisdata$C)), 		
			analysisdata$Exposure,
			rep(mean(analysisdata$C),length(analysisdata$C)),
			rep(mean(analysisdata$Poverty),length(analysisdata$C)))

	std.er<-rep(0,length(analysisdata$Cpred))
	for (i in 1:length(analysisdata$Cpred))	{
	C<-newdata2[i,]
 	std.er[i] <- sqrt(t(C) %*% vcov(p1) %*% C)
 	}


			 LL <- Predicted - (1.96 * std.er)
			  UL <- Predicted + (1.96 * std.er)

	hyp_newdata2b <- as.data.frame( cbind(analysisdata$Exposure,Predicted, LL, UL))
	names(hyp_newdata2b)<-c("Exposure","PredictedProb","LL","UL")


	tiff(filename = "~/output.tiff", type = "cairo" ,  res=300, height = 4, width = 4, units = "in")
		ggplot(hyp_newdata2b, aes(x = Exposure, y = PredictedProb)) + 
		geom_ribbon(aes(ymin = LL, ymax = UL), alpha = 0.25) + 
 		geom_line(aes(), size = 1) +
		scale_y_continuous("Predicted Proportion Hypertension", #breaks = c(.4, .5 ,.6, .7, .8, .9), 
		#limits=c(-150, 250)
		) +
		theme_minimal(base_size = 14) +
		scale_x_continuous("Average day-night noise level (dBA)",
		#breaks=c(40, 50,60,70),
		#labels=c("40","50","60","70")
		)
		

		dev.off()