Motivation

The purpose of this document is to identify what impact, if any, spatial correlation within and between built environment features may have on the ability of the rstap package estimating procedure to correctly estimate parameters. We’ll begin by loading in the appropriate libraries, including, the rstap package for modeling these kinds of data.

library(rstap)
library(dplyr)
library(tidyr)
library(spatstat)
library(ggplot2)

More advanced

To induce correlation between business locations, we generate the x and y coordinates from an inhomogenous poisson process, with intensity function \(f(x,y) = 200xy\). So the mean number of businesses increases as \(x\) and \(y\) increase.

intenfun <- function(x,y) 10*abs(1-(abs(x)^2 + abs(y)^2))
nclust <- function(x0, y0, radius = .1, n) {
              return(runifdisc(n, radius, centre=c(x0, y0)))
            }
x <- seq(from = -1, to = 1, by = 0.01)
y <- seq(from = -1, to = 1, by = 0.01)
xy <- expand.grid(x=x,y=y)
z <- intenfun(xy$x,xy$y)
cbind(x=xy$x,y=xy$y,mean = z) %>% as_data_frame() %>% 
    mutate(cut_y = cut(y,4)) %>% 
    ggplot(aes(x=x,y=y,color=z)) + geom_point() + scale_color_continuous(low='white',high='red') +
        theme_bw() + ggtitle("Intensity Function: Mean of Poisson Across 2D Space")

We continue to generate subjects uniformly across the space. The following plot shows where all subjects and built environment features are located in our artificial space.

set.seed(2524)
bef_rpp <- spatstat::rPoissonCluster(kappa = intenfun, expand = 4, lmax = 100,
                                     rcluster = nclust, radius = .2, n = 2, win = owin(c(-1,1),c(-1,1)))
bef_df <- data_frame(x = bef_rpp$x,
                     y = bef_rpp$y, class = 'BEF')
intenfun <- function(x,y) 250*abs(1-(abs(x)^2 + abs(y)^2))
sub_rpp <- spatstat::rpoispp(intenfun, win = owin(c(-1,1),c(-1,1)), lmax = 500)
sub_df <- data_frame(x = sub_rpp$x,
                     y = sub_rpp$y,
                     class = "Subject")
rbind(bef_df,sub_df) %>% ggplot(aes(x=x,y=y,color=class)) + geom_point() + 
    theme_bw() + ggtitle("Correlated Business Positions")

Then suppose the form of some measurement on the subjects is:

\[ E[Y_i] = \alpha + Z_i \delta + X_i(\theta)\beta_2\\ X_i(\theta) = \sum_{d \in \mathcal{D}_i} \text{w}(\frac{d}{\theta}) \]

Using the following fixed parameter estimates.

alpha <- 22.5
Z <- rbinom(n = nrow(sub_df), prob = .45, size = 1)
delta <- -.8
theta <- .3
beta <- 1.2
sigma <- 2.3

Assuming the exposure is imposed across all current features, then this results in the following dataset and marginal distribution of the outcome.

dists <- fields::rdist(as.matrix(sub_df[,1:2]),
                       as.matrix(bef_df[,1:2]))
X <- apply(dists,1,function(x) sum(pracma::erfc(x/theta)))
X_tilde <- (X-mean(X))/sd(X) ## center and scale the exposure effect
y <- alpha + Z*delta + X_tilde*beta + rnorm(n = nrow(sub_df), mean = 0, sd = sigma)
par(mfrow=c(1,2))
hist(y)
hist(X, main = "Distribution of 'Exposure' to BEF")

Additionally, this results in the following decay function and marginal distribution of “Exposure”.

d <- seq(from = 0, to = max(dists), by = 0.01)
X_theta_one <- pracma::erfc(d/theta)
par(mfrow = c(1,2))
plot(d,X_theta_one,type='l',main = "Exposure Decay Function", xlab = "Distance",
     ylab = "Exposure")
hist(dists)

Modeling these data with rstap…

distance_data <- dists %>% as_data_frame() %>%
    mutate(subj_id = 1:nrow(sub_df)) %>% 
    gather(contains("V"), key = 'BEF',value = 'Distance') %>% 
    mutate(BEF = 'Fast_Food')

subject_data <- data_frame(subj_id = 1:nrow(sub_df),
                           y = y,
                           sex = factor(Z,labels=c("M","F")))

fit <- stap_glm(formula = y ~ sex + sap(Fast_Food),
                subject_data = subject_data,
                distance_data = distance_data,
                family = gaussian(link = 'identity'),
                id_key = 'subj_id',
                prior = normal(location = 0, scale = 5, autoscale = F),
                prior_intercept = normal(location = 25, scale = 5, autoscale = F),
                prior_stap = normal(location = 0, scale = 3, autoscale = F),
                prior_theta = log_normal(location = 0, scale = 1), 
                prior_aux = cauchy(location = 0,scale = 5),
                max_distance = max(dists), chains = 2, cores = 2) ## include all data
results <- rbind(c(alpha,delta,beta,theta),coef(fit))
rownames(results) <- c("True","Estimated")
results
se(fit)