|
| 1 | +library(ggplot2) |
| 2 | +library(MASS) |
| 3 | +library(ggfortify) |
| 4 | +library(gridExtra) |
| 5 | +library(markdown) |
| 6 | + |
| 7 | +#' @param n number of observations |
| 8 | +#' @param a intercept |
| 9 | +#' @param b_x effect of continuous variable |
| 10 | +#' @param b_fac effect of categorial variable |
| 11 | +#' @param b_int interaction effect |
| 12 | +#' @param link link function |
| 13 | +#' @param family error distribution function |
| 14 | +datagen <- function(n = 100, |
| 15 | + a = 0, |
| 16 | + b_x = 2, |
| 17 | + b_fac = -1, |
| 18 | + b_int = -3, |
| 19 | + link = c('identity', 'log'), |
| 20 | + family = c('gaussian', 'poisson', 'negbin'), |
| 21 | + sigma = 1, |
| 22 | + dispersion = 4) { |
| 23 | + link <- match.arg(link) |
| 24 | + family <- match.arg(family) |
| 25 | + |
| 26 | + x <- runif(n) |
| 27 | + fac <- sample(c('A', 'B'), n, replace = TRUE) |
| 28 | + fac_dummy <- ifelse(fac == 'A', 0, 1) |
| 29 | + |
| 30 | + # mean |
| 31 | + link_mu <- a + b_x*x + b_fac*fac_dummy + b_int*fac_dummy*x |
| 32 | + mu <- switch(link, |
| 33 | + identity = link_mu, |
| 34 | + log = exp(link_mu)) |
| 35 | + if (family %in% c('poisson', 'negbin') && any(mu < 0)) |
| 36 | + stop("Cannot simulate Poisson or NegBin with negative mean. Maybe change link function.") |
| 37 | + # response |
| 38 | + y <- switch(family, |
| 39 | + poisson = rpois(n, mu), |
| 40 | + gaussian = rnorm(n, mean = mu, sd = sigma), |
| 41 | + negbin = rnbinom(n, mu = mu, size = 1/dispersion)) |
| 42 | + |
| 43 | + # return |
| 44 | + df <- data.frame(x, y, fac) |
| 45 | + return(df) |
| 46 | +} |
| 47 | + |
| 48 | +#' @param df data.frame as returned by datagen |
| 49 | +#' @param link link function |
| 50 | +#' @param family error distribution function |
| 51 | +datamodel <- function(df, |
| 52 | + family = c('gaussian', 'poisson', 'negbin'), |
| 53 | + link = c('identity', 'log'), |
| 54 | + terms = c('intercept', 'x', 'fac', 'both', 'interaction')){ |
| 55 | + link <- match.arg(link) |
| 56 | + family <- match.arg(family) |
| 57 | + terms <- match.arg(terms) |
| 58 | + form <- switch(terms, |
| 59 | + intercept = as.formula(y ~ 1), |
| 60 | + x = as.formula(y ~ x), |
| 61 | + fac = as.formula(y ~ fac), |
| 62 | + both = as.formula(y ~ x+fac), |
| 63 | + interaction = as.formula(y ~ x*fac) |
| 64 | + ) |
| 65 | + start <- switch(terms, |
| 66 | + intercept = 1, |
| 67 | + x = rep(1, 2), |
| 68 | + fac = rep(1, 2), |
| 69 | + both = rep(1, 3), |
| 70 | + interaction = rep(1, 4) |
| 71 | + ) |
| 72 | + |
| 73 | + mod <- switch(family, |
| 74 | + poisson = glm(form, data = df, family = poisson(link = link), |
| 75 | + start = start), |
| 76 | + gaussian = glm(form, data = df, family = gaussian(link = link), |
| 77 | + start = start), |
| 78 | + negbin = glm.nb(form, data = df), link = link) |
| 79 | + return(mod) |
| 80 | +} |
| 81 | + |
| 82 | +#' @param df data.frame as returned by datagen |
| 83 | +#' @param mod model as returned by datamodel |
| 84 | +dataplot <- function(df, mod = NULL) { |
| 85 | + lim <- c(-10, 10) |
| 86 | + # model fit + ci |
| 87 | + pdat <- expand.grid(x = seq(min(df$x), max(df$x), |
| 88 | + length.out = 100), |
| 89 | + fac = levels(df$fac)) |
| 90 | + pdat$fit <- predict(mod, newdata = pdat, type = "link") |
| 91 | + pdat$se <- predict(mod, newdata = pdat, type = "link", se.fit = TRUE)$se.fit |
| 92 | + mod_fam <- mod$family$family |
| 93 | + mod_fam <- ifelse(grepl('Negative Binomial', mod_fam), 'negbin', mod_fam) |
| 94 | + crit <- switch(mod_fam, |
| 95 | + gaussian = qt(0.975, df = mod$df.residual), |
| 96 | + poisson = qnorm(0.975), |
| 97 | + negbin = qt(0.975, df = mod$df.residual)) |
| 98 | + pdat$lwr <- pdat$fit - crit * pdat$se |
| 99 | + pdat$upr <- pdat$fit + crit * pdat$se |
| 100 | + pdat$fit_r <- mod$family$linkinv(pdat$fit) |
| 101 | + pdat$lwr_r <- mod$family$linkinv(pdat$lwr) |
| 102 | + pdat$upr_r <- mod$family$linkinv(pdat$upr) |
| 103 | + |
| 104 | + # simulate from model for PI |
| 105 | + nsim <- 1000 |
| 106 | + y_sim <- simulate(mod, nsim = nsim) |
| 107 | + y_sim_minmax <- apply(y_sim, 1, quantile, probs = c(0.05, 0.95)) |
| 108 | + simdat <- data.frame(ysim_min = y_sim_minmax[1, ], |
| 109 | + ysim_max = y_sim_minmax[2, ], |
| 110 | + x = df$x, |
| 111 | + fac = df$fac) |
| 112 | + p <- ggplot() + |
| 113 | + geom_ribbon(data = simdat, aes(x = x, |
| 114 | + ymax = ysim_max, ymin = ysim_min, |
| 115 | + fill = fac), alpha = 0.2) + |
| 116 | + geom_line(data = pdat, aes(x = x, y = fit_r, col = fac)) + |
| 117 | + geom_line(data = pdat, aes(x = x, y = upr_r, col = fac), |
| 118 | + linetype = 'dashed') + |
| 119 | + geom_line(data = pdat, aes(x = x, y = lwr_r, col = fac), |
| 120 | + linetype = 'dashed') + |
| 121 | + geom_point(data = df, aes(x = x, y = y, color = fac)) + |
| 122 | + labs(y = 'y') + |
| 123 | + ylim(lim) |
| 124 | + p |
| 125 | +} |
| 126 | + |
| 127 | +coefplot <- function(a = 2, |
| 128 | + b_x = 1, |
| 129 | + b_fac = 1, |
| 130 | + b_int = -2, |
| 131 | + mod) { |
| 132 | + coefs <- coef(mod) |
| 133 | + se <- diag(vcov(mod))^0.5 |
| 134 | + terms <- c('a', 'b_x', 'b_fac', 'b_int') |
| 135 | + terms <- terms[seq_along(coefs)] |
| 136 | + truths <- c(a, b_x, b_fac, b_int) |
| 137 | + truths <- truths[seq_along(coefs)] |
| 138 | + df <- data.frame(term = terms, estimate = coefs, se, truths) |
| 139 | + mod_fam <- mod$family$family |
| 140 | + mod_fam <- ifelse(grepl('Negative Binomial', mod_fam), 'negbin', mod_fam) |
| 141 | + crit <- switch(mod_fam, |
| 142 | + gaussian = qt(0.975, df = mod$df.residual), |
| 143 | + poisson = qnorm(0.975), |
| 144 | + negbin = qt(0.975, df = mod$df.residual)) |
| 145 | + df$lwr <- df$estimate - crit * df$se |
| 146 | + df$upr <- df$estimate + crit * df$se |
| 147 | + |
| 148 | + p <- ggplot(df, aes(x = term)) + |
| 149 | + geom_pointrange(aes(y = estimate, ymax = upr, ymin = lwr)) + |
| 150 | + geom_point(aes(y = truths), col = 'red') + |
| 151 | + geom_vline(xintercept = 0, linetype = 'dashed') + |
| 152 | + coord_flip() |
| 153 | + p |
| 154 | +} |
| 155 | + |
| 156 | +diagplot <- function(df, mod) { |
| 157 | + rfdat <- data.frame(res = residuals(mod, type = 'pearson'), |
| 158 | + fac = df$fac, |
| 159 | + fit = predict(mod, type = 'response')) |
| 160 | + p1 <- ggplot() + |
| 161 | + geom_point(data = rfdat, aes(x = fit, y = res, color = fac)) + |
| 162 | + ggtitle('Residuals vs. Fitted') + |
| 163 | + labs(x = 'Residuals', y = 'Fitted') + |
| 164 | + geom_smooth(data = rfdat, aes(x = fit, y = res, color = fac), |
| 165 | + se = FALSE) + |
| 166 | + geom_smooth(data = rfdat, aes(x = fit, y = res), color = 'blue', |
| 167 | + se = FALSE) + |
| 168 | + geom_abline(aes(intercept = 0, slope = 0), linetype = 'dashed') |
| 169 | + |
| 170 | + |
| 171 | + ofdat <- data.frame(obs = df$y, |
| 172 | + fac = df$fac, |
| 173 | + fit = predict(mod, type = 'response')) |
| 174 | + p2 <- ggplot() + |
| 175 | + geom_point(data = ofdat, aes(x = obs, y = fit, color = fac)) + |
| 176 | + ggtitle('Fitted vs Observed') + |
| 177 | + labs(x = 'Observed', y = 'Fitted') + |
| 178 | + geom_abline(aes(intercept = 0, slope = 1), linetype = 'dashed') |
| 179 | + plot.list <- list(p1, p2) |
| 180 | + new("ggmultiplot", plots = plot.list, nrow = 1, ncol = 2) |
| 181 | + } |
| 182 | + |
| 183 | + |
| 184 | +chk_pos <- function(y, fam) { |
| 185 | + if (fam %in% c('poisson', 'negbin') && any(y < 0)) { |
| 186 | + "Negative values in data. Cannot fit Poisson or NegBin." |
| 187 | + } else { |
| 188 | + NULL |
| 189 | + } |
| 190 | +} |
0 commit comments