Getting the data ready

#Look at deer data and spatial layers to be used as covariates

  plot(forest.cover,main="Forest Cover")
  points(deer$x_,deer$y_,col=0)

  plot(shrub.cover,main="Standaradized Shrub Cover")
  points(deer,col="white")

Assignment (Part 1)

Conduct a sensitivity analysis to investigate how many available samples are needed for the slope coeficients to be estimated with minimal approximation error. Consider a range of sizes for the available sample between 50 and 500,000. Fit the same model (case_ ~ forest + cover) to each of these datasets. Extract the slope coeficients for the variables ‘forest’ and ‘cover’. Creat an x-y line plot with these estimates (y-axis) and the size of the available sample (x-axis). Determine generally when the coefficient estimates stabilize. A common recommendation in the literature is to used a 1:1 ratio between used locations and available samples. If you had only fit the model with this size of available how off would your estimates be from the correct/converged/well approximated estimates when using a large available sample.

# Define the home range with MCP and buffer by 5000m
  hr <- hr_mcp(deer) |> hr_isopleths() |> sf::st_buffer(dist = 5000)

# Take a really large sample of available locations (the zeros)
  rsf.dat.large <- random_points(hr, n = 4000000, presence = deer) |> 
                   extract_covariates(forest.cover) |>
                   extract_covariates(shrub.cover) 

# Decide on how to subset the available samples
  availables = c(50,100,500,1000,10000,100000,1000000,2000000,3000000,4000000)

# Find the indices of where the deer locations are (index.1)  and the available samples (index.0)
  index.0 = which(rsf.dat.large$case_==0)
  index.1 = which(rsf.dat.large$case_==1)
  coef.mat=NULL

# Loop through the numbers of available samples and iteratively grab an increase set of zeros. Combine
# these zeros with the used locations of deer and then fit the model 
  for(i in 1:length(availables)){
    rsf.smaller =  rbind(rsf.dat.large[index.1,],rsf.dat.large[index.0[1:availables[i]],])
    fit = glm(case_~forest+shrub.cover, data=rsf.smaller,family=binomial(link="logit"))
    coef.mat=rbind(coef.mat,coef(fit)[c(2,3)])
    remove(fit)
  }

  remove(rsf.dat.large)
  
  knitr::kable(cbind(availables,coef.mat))
availables forest shrub.cover
5e+01 4.082826 -1.1408137
1e+02 4.540003 -0.9765100
5e+02 3.587066 -0.6990556
1e+03 3.370832 -0.7103731
1e+04 3.145356 -0.5694584
1e+05 3.158024 -0.5265558
1e+06 3.169989 -0.5271564
2e+06 3.152163 -0.5266072
3e+06 3.151536 -0.5261268
4e+06 3.149747 -0.5259137

Let’s plot these coefficients by the available sample size

plot(availables,coef.mat[,1],lwd=3,xlab="Available Sample Size",ylab="Coeficient",type="b",ylim=c(-1,4))
lines(availables,coef.mat[,2],lwd=3,col=2,type="b")

We can see that the first and second decimal place values converge for both coefficients by 2,000,000 available samples.

Next, lets specifically compare the converged estimated coefficients to estimates when using a 1:1 ratio of used:available.

rsf.dat <- random_points(hr, n = nrow(deer), presence = deer) |> 
             extract_covariates(forest.cover) |>
             extract_covariates(shrub.cover) 

fit = glm(case_~forest+shrub.cover, 
          data=rsf.dat,
          family=binomial(link="logit"))
summary(fit)
## 
## Call:
## glm(formula = case_ ~ forest + shrub.cover, family = binomial(link = "logit"), 
##     data = rsf.dat)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.59500    0.06568   -9.06   <2e-16 ***
## forest       3.09338    0.26379   11.73   <2e-16 ***
## shrub.cover -0.79583    0.06391  -12.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2290.2  on 1651  degrees of freedom
## Residual deviance: 1955.9  on 1649  degrees of freedom
## AIC: 1961.9
## 
## Number of Fisher Scoring iterations: 4
# Absolute difference b/w converged and non-converged estimates
(coef.mat[10,] - coef(fit)[2:3])
##      forest shrub.cover 
##  0.05636538  0.26991436

We can see that if we had used a 1:1 ratio of used to available, we would have underestimated the effect of forest and overestimated the effect of shrub cover.

Assignment (Part 2)

### Setup environment
# Include time of day variable
  deer = time_of_day(deer)
  head(deer)
## # A tibble: 6 × 5
##         x_       y_ t_                  burst_ tod_ 
## *    <dbl>    <dbl> <dttm>               <dbl> <fct>
## 1 4314068. 3445807. 2008-03-30 00:01:47      1 night
## 2 4314053. 3445768. 2008-03-30 06:00:54      1 day  
## 3 4314105. 3445859. 2008-03-30 12:01:47      1 day  
## 4 4314044. 3445785. 2008-03-30 18:01:24      1 night
## 5 4313015. 3445858. 2008-03-31 00:01:23      1 night
## 6 4312860. 3445857. 2008-03-31 06:01:45      1 day

Setup available sample

# Create MCP home rnage with 5000 m bufer
hr <- hr_mcp(deer) |> hr_isopleths() |> 
  sf::st_buffer(dist =5000)


# Draw random available samples
  set.seed(5454)
  rsf1 <- random_points(hr, n=20000, presence = deer) |> 
    extract_covariates(forest.cover)  
  table(rsf1$case_)
## 
## FALSE  TRUE 
## 20000   826
plot(rsf1)

### Create data set with time of day

# find where the used locations are and the available locations
  index.1=which(rsf1$case_==1)
  index.0=which(rsf1$case_==0)

# Create a new column for 'time of day', tod
# Put temporary values
  rsf1$tod="temp"

#For the used locations, put in the observed time of day
  rsf1$tod[index.1] = as.character(deer$tod_)
  
#For the available samples make these all 'day'  
  rsf1$tod[index.0] = "day"

#Now, we need to replicate the available sample but for 'night'
  temp = rsf1[index.0,]
  temp$tod="night"

#Combine the used locations, available sample with 'day', and the available sample with 'night'  
  rsf.combined = rbind(rsf1,temp)

Fit Model

rsf2 = glm(case_~forest*tod, 
           data=rsf.combined,
           family=binomial(link="logit"))
summary(rsf2)
## 
## Call:
## glm(formula = case_ ~ forest * tod, family = binomial(link = "logit"), 
##     data = rsf.combined)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -4.06057    0.05578 -72.797  < 2e-16 ***
## forest           2.89745    0.15362  18.861  < 2e-16 ***
## todnight        -0.49591    0.09046  -5.482 4.21e-08 ***
## forest:todnight  0.05743    0.24807   0.232    0.817    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8078.8  on 40825  degrees of freedom
## Residual deviance: 7603.4  on 40822  degrees of freedom
## AIC: 7611.4
## 
## Number of Fisher Scoring iterations: 7

First, we know the intercept doesn’t mean anything. The effect of forest coefficient is positive and indicates that selection increases with forest cover during the day time (i.e., not night). The todnight covariate indicates selection at a forest cover of zero during the night. Since this coefficient is negative, we know that deer are avoiding these open areas during the night. Both those coefficients are statistically clear. The interaction is not statistically clear, and thus there is no statistically clear difference of selection at night versus during the day for increasing forest cover.

Predictions

# Create dataframes of covariate combinations
  newdata1 = data.frame(forest = seq(0,1,by=0.1), tod="night")
  newdata2 = data.frame(forest = seq(0,1,by=0.1), tod="day")

# Make sure the intercept is zero and not included in the predictions  
  rsf2$coefficients[1] = 0 
  coef(rsf2)
##     (Intercept)          forest        todnight forest:todnight 
##      0.00000000      2.89744563     -0.49590813      0.05743005
# Combine linear terms of covariates and coeficients  
  preds.link.night = predict(rsf2,newdata=newdata1, type="link")
  preds.link.day = predict(rsf2,newdata=newdata2, type="link")

  
# Exponentiate linear terms  
  relative.prob.night = exp(preds.link.night)
  relative.prob.day = exp(preds.link.day)

# Plot relative intensity of seleciton by forest cover during the day and night  
  plot(newdata1$forest,relative.prob.night,type="l",lwd=3,xlab="Forest Cover",ylab="Relative Intensity of Selection",ylim=c(0,12))
  lines(newdata2$forest,relative.prob.day,type="l",lwd=3,col=2)
  abline(h=1,lwd=3,col=3,lty=3)
  legend("topleft",lwd=3,col=c(1,2,3),legend=c("Night","Day"))
  text(0.9,1.4,"Selection")
  text(0.9,0.7,"Avoidance")