#######################################################################-
# useful packages ----
#######################################################################-

library(dplyr)

#######################################################################-
# read in raw data ----
#######################################################################-


# 2x2 table data 
rawdat = textConnection(
"case,exposed,n
1,1,58
1,0,117
0,1,27
0,0,148
", open = "r")
study_2x2 = read.csv(rawdat)

# smoking stratified data, inferred from the text
rawdat_smk = textConnection(
"case,exposed,smoker,n
1,1,1,44
1,0,1,50
0,1,1,20
0,0,1,11
1,1,0,14
1,0,0,67
0,1,0,7
0,0,0,137
", open = "r")

study_2x2x2 = read.csv(rawdat_smk)

close.connection(rawdat)
close.connection(rawdat_smk)

#######################################################################-
# apparent analysis (here, crude for the pseudo-data approach, adjusted for the bias factor approach) ----
#######################################################################-


# apparent odds/risk ratios
(rr_apparent <- exp(glm(case~exposed, weight=n, data = study_2x2, family = binomial(link="logit"))$coefficients[2]))
(rr_smkadjusted <- exp(glm(case~exposed+smoker, weight=n, data = study_2x2x2, family = binomial(link="logit"))$coefficients[2]))

conf_bias =  rr_apparent / rr_smkadjusted # estimate of bias due to confounding by smoking
# confidence intervals
(rr_apparent_ci <- exp(confint(glm(case~exposed, weight=n, data = study_2x2, family = binomial(link="logit")))[2,]))
(rr_smkadjusted_ci <- exp(confint(glm(case~exposed+smoker, weight=n, data = study_2x2x2, family = binomial(link="logit")))[2,]))


#######################################################################-
# useful functions ----
#######################################################################-

# function: g_of_ab ----
# commonly used function to combine risk/odds ratios into a common
# bias factor
g_of_ab <- function(a,b){
  a*b/(a+b-1)
}

# function: boundaway ----
#  useful to avoid division by zero in cases where one wishes
#  to plug in parameter values that indicate no bias
boundaway <- function(x,eps=0.00000001) {
  ifelse(x==1, 1-eps, ifelse(x==0, eps, x))
}

# function: checkdata ----
#  useful to check for implausible cells in pseudo-data
checkdata <- function(data2x2){
  if(any(data2x2$n < 0 )) stop("Bias correction at current parameter values yielded implausible data")
}


#######################################################################-
# correct for selection bias ----
# note that the bounding approach and pseudo-data approach
# use fundamentally different assumption sets and have
# different input parameters
#######################################################################-

##################################-
## Selection bias functions ----
##################################-

## function: selection_biasfactor ----
selection_biasfactor <- function(risky_exposed=c(1,1), 
                                 risky_unexposed=c(1,1), 
                                 prevu_selected_exposed=1, 
                                 prevu_selected_unexposed=1, 
                                 prevu_unselected_exposed=1,
                                 prevu_unselected_unexposed=1){
  # this approach conceptualizes selection bias as being due to a latent, or unmeasured factor
  # see: Smith LH, VanderWeele TJ. Bounding bias due to selection. Epidemiology. 2019;30:509–516.
  
  rr_usy_1 = max(risky_exposed)/min(risky_exposed)
  rr_usy_0 = max(risky_unexposed)/min(risky_unexposed)
  #
  rr_sus_1 = (prevu_selected_exposed)/(prevu_unselected_exposed)
  rr_sus_0 = (prevu_selected_unexposed)/(prevu_selected_unexposed)
  
  rr_sus_1 = max(rr_sus_1, 1/rr_sus_1)
  rr_sus_0 = max(rr_sus_0, 1/rr_sus_0)
  
  bias_factor_selection <- g_of_ab(rr_usy_1, rr_sus_1) * g_of_ab(rr_usy_0, rr_sus_0)
  bias_factor_selection
}

## function: selection_correct ----
selection_correct <- function(data2x2, case_1sel=1, case_0sel=1, cont_1sel=1, cont_0sel=1){
 # inverse probability of selection weight based correction of cell frequencies
  A0 = with(data2x2, (sum(case*exposed*n)/case_1sel))
  B0 = with(data2x2, (sum(case*(1-exposed)*n)/case_0sel))
  C0 = with(data2x2, (sum((1-case)*exposed*n)/cont_1sel))
  D0 = with(data2x2, (sum((1-case)*(1-exposed)*n)/cont_0sel))
  
  data2x2_adj <- data2x2 %>%
    mutate(
      n = case_when(
        case==1 & exposed == 1 ~ A0,
        case==1 & exposed == 0 ~ B0,
        case==0 & exposed == 1 ~ C0,
        case==0 & exposed == 0 ~ D0,
      )
    )
  data2x2_adj
}

##################################-
## Selection bias examples ----
##################################-

## selection bias parameters used in bias-factor example ----
# stratum specific risks
or_female = (93/1077)/(624/2400)
risk_males_exposed = 0.08
risk_males_unexposed = 0.02
risky_exposed = c(risk_males_exposed, risk_males_exposed*or_female)
risky_unexposed = c(risk_males_unexposed, risk_males_unexposed*or_female)
# prevalence of being male, given exposure and selection into study, by Bayes formula
pmale_population = 0.5
popium_male_population = 0.241 # no unrounded values available
popium_female_population = 0.022 # no unrounded values available
popium_male_sample = 0.241 # no unrounded values available

pmale_selected = 153/(153+22)
pselection = (153+22)/(153*2) # marginal probability of selection
popium_population = popium_male_population*pmale_population + popium_female_population*(1-pmale_population)
popium_selected = 27/(27+148)

p_opium_unselected = (popium_population - (pselection*popium_selected))/(1-pselection)


# gives conditional probabilites of being male - the function also
#  uses this to calculate conditional probabilities of being female
prevu_unselected_exposed =  popium_male_population*pmale_population*popium_selected
prevu_unselected_unexposed =  (1-popium_male_population)*pmale_selected*popium_selected
prevu_selected_exposed =  popium_male_population*pmale_selected*popium_selected
prevu_selected_unexposed =  (1-popium_male_population)*pmale_selected*(p_opium_unselected)

## selection bias bounding factor ----
bfs <- selection_biasfactor(risky_exposed=risky_exposed, 
                            risky_unexposed=risky_unexposed, 
                            prevu_selected_exposed=prevu_unselected_exposed, 
                            prevu_selected_unexposed=prevu_unselected_unexposed, 
                            prevu_unselected_exposed=prevu_selected_exposed, 
                            prevu_unselected_unexposed=prevu_selected_unexposed
)


## selection bias parameters used in pseudo-data example ----
pmale_population = 0.5
pfemale_controls = 22/(153+22)
pselection_controls = (153+22)/(153+153) 

popium_male_population = 0.241 # no unrounded values available
popium_female_population = 0.022 # no unrounded values available
popium_male_sample = 0.241 # no unrounded values available
pmale_selected = 153/(153+22)
popium_selected = 27/(27+148)

popium_sexstandardized = popium_male_population*pmale_population + popium_female_population*(1-pmale_population)

#prevu_unselected_exposed =  popium_male_population*pmale_population*popium_selected
#prevu_unselected_unexposed =  popium_male_population*pmale_selected*popium_selected

# probability of selection, given case and exposure status 
# left side of expansion Pr(S=1|Y,A,U)=Pr(S=1|Y,U)
pselection_femalecontrols = pfemale_controls*pselection_controls/(1-pmale_population)
pselection_malecontrols = (1-pfemale_controls)*pselection_controls/(pmale_population)
# right side of expansion Pr(U=u|Y,A)
pfemale_exposedcontrols = popium_female_population*(1-pmale_population)/popium_sexstandardized
pmale_exposedcontrols = popium_male_population*pmale_population/popium_sexstandardized
pfemale_unexposedcontrols = (1-popium_female_population)*(1-pmale_population)/(1-popium_sexstandardized)
pmale_unexposedcontrols = (1-popium_male_population)*pmale_population/(1-popium_sexstandardized)


(exposed_case_selectionprob <- 1.0)
(unexposed_case_selectionprob <- 1.0)
(exposed_control_selectionprob <- pselection_femalecontrols*pfemale_exposedcontrols + pselection_malecontrols*pmale_exposedcontrols)
(unexposed_control_selectionprob <- pselection_femalecontrols*pfemale_unexposedcontrols + pselection_malecontrols*pmale_unexposedcontrols)


## selection bias adjusted pseudo-data ----
data_select_adj = selection_correct(study_2x2, 
                                    case_1sel=exposed_case_selectionprob, 
                                    case_0sel=unexposed_case_selectionprob, 
                                    cont_1sel=exposed_control_selectionprob,
                                    cont_0sel=unexposed_control_selectionprob
)


#######################################################################-
# adjusted analysis (here, only selection bias) ----
#######################################################################-
# adjusted 2x2 
data_select_adj
# adjusted effect measure
(rr_adj<- exp(glm(case~exposed, weight=n, data = data_select_adj, family = binomial(link='logit'))$coefficients[2]))
# pessimistic OR bound
rr_smkadjusted / bfs


#######################################################################-
# 2) correct for misclassification ----
# note that the bounding approach and pseudo-data approach
# use similar input parameters, but which differs from
# other biases
#######################################################################-


## function: misclass_biasfactor ----
misclass_biasfactor <- function(case_sens=1, case_spec=1, cont_sens=1, cont_spec=1){
  G = boundaway(1-case_spec)
  H = boundaway(1-cont_spec)
  I = boundaway(case_sens)
  J = boundaway(cont_sens)
  fpor = (G/H)/((1-G)/(1-H)) # false positive odds ratio
  seor = (I/J)/((1-I)/(1-J)) # sensitivity odds ratio
  ccr = (I/J)/((1-G)/(1-H))  # correct classification odds ratio
  icr = (G/H)/((1-I)/(1-J))  # incorrect classification odds ratio
  or_astary = max(fpor, seor, ccr, icr)
  bias_factor_misclass <- or_astary
  bias_factor_misclass
}

## function: misclass_correct ----
misclass_correct <- function(data2x2, case_sens=1, case_spec=1, cont_sens=1, cont_spec=1){
  # calculate adjusted 2 by 2 table cells under misclassification
  ncases = sum(with(data2x2, n*case))
  ncontrols = sum(with(data2x2, n*(1-case)))
  A0 = with(data2x2, (sum(case*exposed*n) - ncases*(1-case_spec))/(case_sens - 1 + case_spec))
  B0 = ncases - A0
  C0 = with(data2x2, (sum((1-case)*exposed*n) - ncontrols*(1-cont_spec))/(cont_sens - 1 + cont_spec))
  D0 = ncontrols - C0
  data2x2_adj <- data2x2 %>%
    mutate(
      n = case_when(
        case==1 & exposed == 1 ~ A0,
        case==1 & exposed == 0 ~ B0,
        case==0 & exposed == 1 ~ C0,
        case==0 & exposed == 0 ~ D0,
      )
    )
  data2x2_adj
}


## misclassification parameters used in example ----
# from Rashidia et al (2017)
(case_sensitivity <- 38/49) # 77.5%
(case_specificity <- 129/140) # 92% 
(control_sensitivity <- case_sensitivity)
(control_specificity <- case_specificity)


## exposure misclassification bounding factor ----
bfm <- misclass_biasfactor( 
  case_sens = case_sensitivity, 
  case_spec = case_specificity, 
  cont_sens = control_sensitivity, 
  cont_spec = control_specificity
)

## selection, exposure misclassification adjusted pseudo-data ----
data_selectmisclass_adj = misclass_correct(data_select_adj, 
                                     case_sens = case_sensitivity, 
                                     case_spec = case_specificity, 
                                     cont_sens = control_sensitivity, 
                                     cont_spec = control_specificity
)

#######################################################################-
# selection bias, exposure misclassification adjusted analysis  ----
#######################################################################-
# adjusted 2x2 
data_selectmisclass_adj
# adjusted effect measure (safe to ignore warning about non-integer successes)
(rr_adj<- exp(glm(case~exposed, weight=n, data = data_selectmisclass_adj, family = binomial(link = "logit"))$coefficients[2]))
# pessimistic OR bound
rr_smkadjusted / (bfs*bfm)


#######################################################################-
# correct for unmeasured confounding bias ----
#######################################################################-

## function: confounding_biasfactor ----
confounding_biasfactor <- function(risky_exposed=c(1,1), 
                                 risky_unexposed=c(1,1), 
                                 prevu_exposed=1, 
                                 prevu_unexposed=1){
  # this approach conceptualizes selection bias as being due to a latent, or unmeasured factor
  # see: Smith LH, VanderWeele TJ. Bounding bias due to selection. Epidemiology. 2019;30:509–516.
  
  rr_ucy_1 = max(risky_exposed)/min(risky_exposed)
  rr_ucy_0 = max(risky_unexposed)/min(risky_unexposed)
  rr_ucy = max(rr_ucy_1, rr_ucy_0)
  #
  rr_auc = (prevu_exposed)/(prevu_unexposed)
  rr_auc = max(rr_auc, 1/rr_auc)

  bias_factor_confounding <- g_of_ab(rr_ucy, rr_auc)
  bias_factor_confounding
}

## function: confounding_correct ----
confounding_correct <- function(data2x2, 
                                rr_uy=2, 
                                prevu_exposed=1, 
                                prevu_unexposed=.6, 
                                datatype="casecon" # are 2x2 data in case/control format or case/population format?
                                ){
  if(!(tolower(datatype) %in% c("casecon", "cohort")))
    stop("datatype must be 'casecon' or 'cohort'")
  # inverse probability of selection weight based correction of cell frequencies
  a =  with(data2x2, (sum(case*exposed*n)))
  b =  with(data2x2, (sum(case*(1-exposed)*n)))
  c =  with(data2x2, (sum((1-case)*exposed*n)))
  d =  with(data2x2, (sum((1-case)*(1-exposed)*n)))
  m =  with(data2x2, (sum(exposed*n)))      # here, the "exposed" group is really the total
  n =  with(data2x2, (sum((1-exposed)*n)))   # here, the "exposed" group is really the total
  # RR
  if(toupper(datatype) == "cohort"){
    M1 = with(data2x2, (sum(exposed*n)))*prevu_exposed
    M0 = with(data2x2, (sum(exposed*n))) - M1
    N1 = with(data2x2, (sum((1-exposed)*n)))*prevu_unexposed
    N0 = with(data2x2, (sum((1-exposed)*n))) - N1
    A1 = rr_uy * M1 * a / (rr_uy * M1 + m - M1)
    B1 = rr_uy * N1 * b / (rr_uy * N1 + n - N1)
    A0 = a - A1
    B0 = b - B1
    C1 = M1 - A1
    C0 = M0 - A0
    D1 = N1 - B1
    D0 = N0 - B0
  } else{
    C1 = c*prevu_exposed
    C0 = c - C1
    D1 = d*prevu_unexposed
    D0 = d - D1
    A1 = rr_uy * C1 * a / (rr_uy * C1 + c - C1)
    A0 = a - A1
    B1 = rr_uy * D1 * b / (rr_uy * D1 + d - D1)
    B0 = b - B1
  }
  data2x21 = data2x2 %>%
    mutate(
      u=1
    )
  data2x20 = data2x2 %>%
    mutate(
      u=0
    )
  
  data2x2_adj <- rbind(data2x21,data2x20) %>%
    mutate(
      n = case_when(
        case==1 & exposed == 1 & u == 0 ~ A0,
        case==1 & exposed == 0 & u == 0 ~ B0,
        case==0 & exposed == 1 & u == 0 ~ C0,
        case==0 & exposed == 0 & u == 0 ~ D0,
        case==1 & exposed == 1 & u == 1 ~ A1,
        case==1 & exposed == 0 & u == 1 ~ B1,
        case==0 & exposed == 1 & u == 1 ~ C1,
        case==0 & exposed == 0 & u == 1 ~ D1,
      )
    )
  data2x2_adj
}


## selection bias parameters used in bias-factor example ----
# prevalence of being male, given exposure and selection into study, by Bayes formula
pmale_population = 0.5
popium_male_population = 0.241 # no unrounded values available
popium_female_population = 0.022 # no unrounded values available
popium_male_sample = 0.241 # no unrounded values available

popium_population = popium_male_population*pmale_population + popium_female_population*(1-pmale_population)

pmale_exposed = popium_male_population * pmale_population/popium_population
pmale_unexposed = (1-popium_male_population) * pmale_population/(1-popium_population)

## selection bias parameters used in the pseudo-data example ----
# stratum specific risks
or_female = (93/1077)/(624/2400)
risk_males_exposed = 0.08 # arbitrary
risk_males_unexposed = 0.02 # arbitrary
risky_exposed = c(risk_males_exposed, risk_males_exposed*or_female)
risky_unexposed = c(risk_males_unexposed, risk_males_unexposed*or_female)

p_opium_unselected = (popium_population - (pselection*popium_selected))/(1-pselection)


bfc <- confounding_biasfactor(risky_exposed=risky_exposed, 
                              risky_unexposed=risky_unexposed, 
                              prevu_exposed=pmale_exposed, # pfemale_exposed calculated within the function
                              prevu_unexposed=pmale_unexposed
)

bfc

## misclassification + selection bias + confounding adjusted pseudo-data ----
(data_confoundselectmisclass_adj <- confounding_correct(data_selectmisclass_adj, 
                                                      rr_uy=or_female, 
                                                      prevu_exposed=(1-pmale_exposed), 
                                                      prevu_unexposed=(1-pmale_unexposed), 
                                                      datatype="casecon"
))


############################################################-
# adjusted analysis (here, all 3 biases) ----
############################################################-
# adjusted 2x2x2 
data_confoundselectmisclass_adj
# adjusted effect measure
(rr_adj<- exp(glm(case~exposed+u, weight=n, data = data_confoundselectmisclass_adj, family = binomial(link='logit'))$coefficients[2]))
# pessimistic OR bound
rr_smkadjusted / (bfs * bfm * bfc)

############################################################-
# adjusted analysis (here, selection bias and exposure misclassification bias) ----
############################################################-
# adjusted 2x2x2
data_confoundselectmisclass_adj
# adjusted effect measure
(rr_adj<- exp(glm(case~exposed+u, weight=n, data = data_confoundselectmisclass_adj, family = binomial(link='logit'))$coefficients[2]))
# pessimistic lower bound
rr_apparent / (bfs * bfm * bfc)


############################################################-
# approximately re-correct for measured confounding if needed ----
############################################################-
print(rr_adj/conf_bias)


############################################################-
# sensitivity analysis ----
############################################################-


multiple_bias_analysis_pseudodata <- function(data2x2,
                          exposed_case_selectionprob=1, 
                          unexposed_case_selectionprob=1,
                          exposed_control_selectionprob=1,
                          unexposed_control_selectionprob=1,
                          case_sensitivity=1,
                          case_specificity=1,
                          control_sensitivity=1,
                          control_specificity=1,
                          rr_uy=1,
                          prevu_exposed=1,
                          prevu_unexposed=1,
                          conf_bias = 1.0 # bias from measured confounder
){
  # parameter values
  
  data_select_adj = selection_correct(data2x2, 
                                      case_1sel=exposed_case_selectionprob, 
                                      case_0sel=unexposed_case_selectionprob, 
                                      cont_1sel=exposed_control_selectionprob,
                                      cont_0sel=unexposed_control_selectionprob
  )
  checkdata(data_select_adj)

  data_selectmisclass_adj = misclass_correct(data_select_adj, 
                                             case_sens = case_sensitivity, 
                                             case_spec = case_specificity, 
                                             cont_sens = control_sensitivity, 
                                             cont_spec = control_specificity
  )
  checkdata(data_selectmisclass_adj)
  
  data_confoundselectmisclass_adj <- confounding_correct(data_selectmisclass_adj, 
                                                          rr_uy=rr_uy, 
                                                          prevu_exposed=prevu_exposed, 
                                                          prevu_unexposed=prevu_unexposed, 
                                                          datatype = "casecon"
  )
  checkdata(data_confoundselectmisclass_adj)
  
  suppressWarnings(rr_adj<- exp(glm(case~exposed+u, weight=n, data = data_confoundselectmisclass_adj, family = binomial(link='logit'))$coefficients[2]))
  list(estimate = unname(rr_adj/conf_bias), pseudo_data = data_confoundselectmisclass_adj)
}


# original analysis
multiple_bias_analysis_pseudodata(study_2x2, 
                                  # selection bias
                                  exposed_case_selectionprob=1, 
                                  unexposed_case_selectionprob=1,
                                  exposed_control_selectionprob=0.9283779,
                                  unexposed_control_selectionprob=0.5179202,
                                  # exposure misclassification
                                  case_sensitivity=38/49,
                                  case_specificity=129/140,
                                  control_sensitivity=38/49,
                                  control_specificity=129/140,
                                  # confounding
                                  rr_uy=(93/1077)/(624/2400),
                                  prevu_exposed=0.08365019,
                                  prevu_unexposed=0.5630397,
                                  conf_bias = conf_bias
                                  )

# higher specificity
multiple_bias_analysis_pseudodata(study_2x2, 
                                  # selection bias
                                  exposed_case_selectionprob=1, 
                                  unexposed_case_selectionprob=1,
                                  exposed_control_selectionprob=0.9283779,
                                  unexposed_control_selectionprob=0.5179202,
                                  # exposure misclassification
                                  case_sensitivity=38/49,
                                  case_specificity=1.0,
                                  control_sensitivity=38/49,
                                  control_specificity=1.0,
                                  # confounding
                                  rr_uy=(93/1077)/(624/2400),
                                  prevu_exposed=0.08365019,
                                  prevu_unexposed=0.5630397,
                                  conf_bias = conf_bias
)

# differential exposure misclassification (false positives among cases only)
multiple_bias_analysis_pseudodata(study_2x2, 
                                  # selection bias
                                  exposed_case_selectionprob=1, 
                                  unexposed_case_selectionprob=1,
                                  exposed_control_selectionprob=0.9283779,
                                  unexposed_control_selectionprob=0.5179202,
                                  # exposure misclassification
                                  case_sensitivity=38/49,
                                  case_specificity=129/140,
                                  control_sensitivity=38/49,
                                  control_specificity=1,
                                  # confounding
                                  rr_uy=(93/1077)/(624/2400),
                                  prevu_exposed=0.08365019,
                                  prevu_unexposed=0.5630397,
                                  conf_bias = conf_bias
)

# differential exposure misclassification (false positives among controls only)
multiple_bias_analysis_pseudodata(study_2x2, 
                                  # selection bias
                                  exposed_case_selectionprob=1, 
                                  unexposed_case_selectionprob=1,
                                  exposed_control_selectionprob=0.9283779,
                                  unexposed_control_selectionprob=0.5179202,
                                  # exposure misclassification
                                  case_sensitivity=38/49,
                                  case_specificity=1,
                                  control_sensitivity=38/49,
                                  control_specificity=129/140,
                                  # confounding
                                  rr_uy=(93/1077)/(624/2400),
                                  prevu_exposed=0.08365019,
                                  prevu_unexposed=0.5630397,
                                  conf_bias = conf_bias
)

# stronger confounding
#https://pubmed.ncbi.nlm.nih.gov/8275221/
multiple_bias_analysis_pseudodata(study_2x2, 
                                  # selection bias
                                  exposed_case_selectionprob=1, 
                                  unexposed_case_selectionprob=1,
                                  exposed_control_selectionprob=0.9283779,
                                  unexposed_control_selectionprob=0.5179202,
                                  # exposure misclassification
                                  case_sensitivity=38/49,
                                  case_specificity=129/140,
                                  control_sensitivity=38/49,
                                  control_specificity=129/140,
                                  # confounding
                                  rr_uy=1/5.95,
                                  prevu_exposed=0.08365019,
                                  prevu_unexposed=0.5630397,
                                  conf_bias = conf_bias
)
# use of hospital controls
multiple_bias_analysis_pseudodata(study_2x2, 
                                  # selection bias
                                  exposed_case_selectionprob=1, 
                                  unexposed_case_selectionprob=1,
                                  exposed_control_selectionprob=1.0,
                                  unexposed_control_selectionprob=0.5179202,
                                  # exposure misclassification
                                  case_sensitivity=38/49,
                                  case_specificity=129/140,
                                  control_sensitivity=38/49,
                                  control_specificity=129/140,
                                  # confounding
                                  rr_uy=(93/1077)/(624/2400),
                                  prevu_exposed=0.08365019,
                                  prevu_unexposed=0.5630397,
                                  conf_bias = conf_bias
)

# Abnet et al sensitivity and specificity
multiple_bias_analysis_pseudodata(study_2x2, 
                                  # selection bias
                                  exposed_case_selectionprob=1, 
                                  unexposed_case_selectionprob=1,
                                  exposed_control_selectionprob=0.9283779,
                                  unexposed_control_selectionprob=0.5179202,
                                  # exposure misclassification
                                  case_sensitivity=70/78,
                                  case_specificity=67/72,
                                  control_sensitivity=70/78,
                                  control_specificity=67/72,
                                  # confounding
                                  rr_uy=(93/1077)/(624/2400),
                                  prevu_exposed=0.08365019,
                                  prevu_unexposed=0.5630397,
                                  conf_bias = conf_bias
)

############################################################-
# probabilistic bias analysis  ----
# included here to give the general strategy
############################################################-

# function: bootstrap_tabled_data ----
# special function to bootstrap sample tabular data
bootstrap_tabled_data <- function(data){
  resl = lapply(1:nrow(data), function(x) do.call("rbind", lapply(1:data[x,"n"], function(y) data[x,,drop=FALSE])))
  res = do.call(rbind, resl)
  bootid = sample(1:nrow(res), replace=TRUE)
  newdat = res[bootid,]  %>%
    select(-n)
  tabdat = as.data.frame(table(newdat), responseName="n") %>%
    mutate(
      case=as.numeric(as.character(case)),
      exposed=as.numeric(as.character(exposed))
    )
  tabdat
}

# special function to get bootstrap variance estimates for measured confounding bias
# used in probabilistic bias analysis
bootstrap_conf_bias <- function(data2x2x2){
  testfn <- function(i){
    bootdata = bootstrap_tabled_data(data2x2x2)
    (rr_apparent <- exp(glm(case~exposed, weight=n, data = bootdata, family = binomial(link="logit"))$coefficients[2]))
    (rr_smkadjusted <- exp(glm(case~exposed+smoker, weight=n, data = bootdata, family = binomial(link="logit"))$coefficients[2]))
    conf_bias =  rr_apparent / rr_smkadjusted # estimate of bias due to confounding by smoking
    c(rr_apparent, rr_smkadjusted, conf_bias )
  }
  bootsamps = sapply(1:1000, testfn)
  sd(log(bootsamps))
}

#sd_confbias = bootstrap_conf_bias(study_2x2x2) # 0.1811161

pba_iteration = function(iter, 
                         data2x2,
                         # selection bias
                         exposed_case_selectionprob=1, 
                         unexposed_case_selectionprob=1,
                         exposed_control_selectionprob=0.9283779,
                         unexposed_control_selectionprob=0.5179202,
                         # exposure misclassification
                         case_sensitivity=38/49,
                         case_specificity=129/140,
                         control_sensitivity=38/49,   # set to null if you want this to exactly equal case value
                         control_specificity=129/140, # set to null if you want this to exactly equal case value
                         # confounding
                         rr_uy=(93/1077)/(624/2400),
                         prevu_exposed=0.08365019,
                         prevu_unexposed=0.5630397,
                         conf_bias = conf_bias,
                         verbose=FALSE
                         ){
  if(verbose){
    # this will print out the sensitivity parameters
    print(sapply(as.list(match.call())[6:15], eval, simplify=TRUE))
  }
  if(is.null(control_sensitivity)){
    control_sensitivity=case_sensitivity
  }
  if(is.null(control_specificity)){
    control_specificity=case_specificity
  }
  
  
   if(iter >0) {
     # bootstrap sampling from case control study: case/control ns are fixed
     bootdata <- bootstrap_tabled_data(data2x2)
     } else bootdata = data2x2
  
     res = tryCatch(
       multiple_bias_analysis_pseudodata(bootdata, 
                                         # selection bias
                                         exposed_case_selectionprob=exposed_case_selectionprob, 
                                         unexposed_case_selectionprob=unexposed_case_selectionprob,
                                         exposed_control_selectionprob=exposed_control_selectionprob,
                                         unexposed_control_selectionprob=unexposed_control_selectionprob,
                                         # exposure misclassification
                                         case_sensitivity=case_sensitivity,
                                         case_specificity=case_specificity,
                                         control_sensitivity=control_sensitivity,
                                         control_specificity=control_specificity,
                                         # confounding
                                         rr_uy=rr_uy,
                                         prevu_exposed=prevu_exposed,
                                         prevu_unexposed=prevu_unexposed,
                                         conf_bias = conf_bias
       )$estimate,
       error = function(e) NA,
        finally=NULL
     )
     res
}


# probabilistic bias analysis no bias ----
# parametric bootstrap on confounding bias
set.seed(12312)
rr_samples_nobias = sapply(0:1000, function(i) pba_iteration(iter=i, data2x2=study_2x2, 
                                              exposed_case_selectionprob=1, 
                                              unexposed_case_selectionprob=1,
                                              exposed_control_selectionprob=1,
                                              unexposed_control_selectionprob=1,
                                              # exposure misclassification
                                              case_sensitivity=1,
                                              case_specificity=1,
                                              control_sensitivity=1,
                                              control_specificity=1,
                                              # confounding
                                              rr_uy=1,
                                              prevu_exposed=1,
                                              prevu_unexposed=1,
                                              conf_bias = ifelse(i==0, conf_bias, rlnorm(1, log(conf_bias), 0.1811161))
                                              ), simplify=TRUE)
rr_samples_nobias[1] # the estimate on the original data
#[1] 1.251411
quantile(rr_samples_nobias[-1], c(.5, .025, .975), na.rm=TRUE) # boostrap intervals
# 50%      2.5%     97.5% 
# 1.2494500 0.7102496 2.4609188 


# probabilistic bias analysis using distributionalized versions of original bias parameters----

# choice of prior distributions ----
# selection bias
# misclassification
  # sensitivity and specificity: draws from beta distribution parameterized as
  # B(a,b), where 'a' is the numer of 'successes' and 'b' is the number of 'failures'
  # These can be directly taken from the validation data
  # Sensitivity = B(true positives, false negatives)     ['true positive rate']
  # Specificity = 1 - B(false positives, true negatives) [parameterized as 'false positive rate']
# unmeasured confounding
  # risk ratio: impact of u on y, can use crude log-or as 'prior' for log-rr
Hadji_rawdat = textConnection(
  "case,exposed,n
1,1,93
1,0,1077
0,1,624
0,0,2400
", open = "r")
Hadji_dat = read.csv(Hadji_rawdat)
close.connection(Hadji_rawdat)
log_or_prior = summary(glm(case~exposed, weight=n, family=binomial(), data = Hadji_dat))$coefficients[2,]

# 10000 draws from prior distribution
hist(exp(rnorm(10000, log_or_prior["Estimate"], log_or_prior["Std. Error"])))
  
   # prevalence of U among the exposed: Moradinazar study, again using beta distribution
   # total number exposed: 15600
   # estimated proportion female: 0.08365019
   nfe = 0.08365019*15600 # expected number of females among exposed
   nme = (1-0.08365019)*15600 # expected number of males among exposed
   hist(rbeta(10000, nfe, nme))
   # prevalence of U among the unexposed
   # estimated proportion female: 0.5630397
   nfu = 0.5630397*(130570-15600) # expected number of females among unexposed
   nmu = (1-0.5630397)*(130570-15600) # expected number of males among unexposed
   hist(rbeta(10000, nfu, nmu))
   

# forcing non-differentiality of cases and controls
set.seed(12312)
rr_samples1 = sapply(1:1000, function(i) pba_iteration(iter=i, data2x2=study_2x2,
                                              exposed_case_selectionprob=1, 
                                              unexposed_case_selectionprob=1,
                                              exposed_control_selectionprob=rbeta(1,93,8),
                                              unexposed_control_selectionprob=rbeta(1,52, 48), 
                                              case_sensitivity=rbeta(1,38,49-38),
                                              case_specificity=1-rbeta(1,140-129,129),
                                              control_sensitivity=NULL,
                                              control_specificity=NULL,
                                              rr_uy=exp(rnorm(1, log_or_prior["Estimate"], log_or_prior["Std. Error"])),
                                              prevu_exposed=rbeta(1, 0.08365019*15600, (1-0.08365019)*15600),
                                              prevu_unexposed=rbeta(1, 0.5630397*(130570-15600), (1-0.5630397)*(130570-15600)),
                                              conf_bias = rlnorm(1, log(conf_bias), 0.1811161),
                                            verbose=FALSE
), simplify=TRUE)

quantile(rr_samples1, c(.5, .025, .975), na.rm=TRUE) # boostrap intervals
# 50%      2.5%     97.5% 
# 4.815092  1.195824 69.046440 
mean(is.na(rr_samples1))# proportion of samples with implausible values
#[1] 0.324


# allowing cases and controls to differ in sens/spec, but being similar on average
set.seed(12312)
rr_samples2 = sapply(1:1000, function(i) pba_iteration(iter=i, data2x2=study_2x2,
                                               exposed_case_selectionprob=1, 
                                               unexposed_case_selectionprob=1,
                                               exposed_control_selectionprob=rbeta(1,93,8),
                                               unexposed_control_selectionprob=rbeta(1,52, 48), 
                                               case_sensitivity=rbeta(1,38,49-38),
                                               case_specificity=1-rbeta(1,140-129,129),
                                               control_sensitivity=rbeta(1,38,49-38),
                                               control_specificity=1-rbeta(1,140-129,140),
                                               rr_uy=exp(rnorm(1, log_or_prior["Estimate"], log_or_prior["Std. Error"])),
                                               prevu_exposed=rbeta(1, 0.08365019*15600, (1-0.08365019)*15600),
                                               prevu_unexposed=rbeta(1, 0.5630397*(130570-15600), (1-0.5630397)*(130570-15600)),
                                               conf_bias = rlnorm(1, log(conf_bias), 0.1811161),
                                               verbose=FALSE
), simplify=TRUE)

quantile(rr_samples2, c(.5, .025, .975), na.rm=TRUE) # boostrap intervals
#50%      2.5%     97.5% 
#4.075963  1.185807 65.023321 
mean(is.na(rr_samples2))# proportion of samples with implausible values
# 0.225

# alternative sens/spec estimatates from abnet
#case_sensitivity=70/78,
#case_specificity=67/72,
set.seed(12312)
rr_samples3 = sapply(1:1000, function(i) pba_iteration(iter=i, data2x2=study_2x2,
                                               exposed_case_selectionprob=1, 
                                               unexposed_case_selectionprob=1,
                                               exposed_control_selectionprob=rbeta(1,93,8),
                                               unexposed_control_selectionprob=rbeta(1,52, 48), 
                                               case_sensitivity=rbeta(1,70,78-70),
                                               case_specificity=1-rbeta(1,72-67,67),
                                               control_sensitivity=case_sensitivity,
                                               control_specificity=case_specificity,
                                               rr_uy=exp(rnorm(1, log_or_prior["Estimate"], log_or_prior["Std. Error"])),
                                               prevu_exposed=rbeta(1, 0.08365019*15600, (1-0.08365019)*15600),
                                               prevu_unexposed=rbeta(1, 0.5630397*(130570-15600), (1-0.5630397)*(130570-15600)),
                                               conf_bias = rlnorm(1, log(conf_bias), 0.1811161),
                                               verbose=FALSE
), simplify=TRUE)

quantile(rr_samples3, c(.5, .025, .975), na.rm=TRUE) # boostrap intervals
mean(is.na(rr_samples3))# proportion of samples with implausible values