# Look at deer data and spatial layers to be used as covariates
plot(forest.cover, main = "Forest Cover")
# Use "white" explicitly (was col = 0, i.e. background color) for consistency
points(deer$x_, deer$y_, col = "white")
# Fixed title typo: "Standaradized" -> "Standardized"
plot(shrub.cover, main = "Standardized Shrub Cover")
points(deer, col = "white")
Conduct a sensitivity analysis to investigate how many available samples are needed for the slope coefficients to be estimated with minimal approximation error. Consider a range of sizes for the available sample between 50 and 500,000. Fit the same model (case_ ~ forest + shrub.cover) to each of these datasets. Extract the slope coefficients for the variables 'forest' and 'shrub.cover'. Create an x-y line plot with these estimates (y-axis) and the size of the available sample (x-axis). Determine generally when the coefficient estimates stabilize. A common recommendation in the literature is to use a 1:1 ratio between used locations and available samples. If you had only fit the model with an available sample of this size, how far off would your estimates be from the correct/converged/well-approximated estimates obtained with a large available sample?
# Define the home range with MCP and buffer by 5000m
hr <- hr_mcp(deer) |> hr_isopleths() |> sf::st_buffer(dist = 5000)
# Take a really large sample of available locations (the zeros)
rsf.dat.large <- random_points(hr, n = 4000000, presence = deer) |>
  extract_covariates(forest.cover) |>
  extract_covariates(shrub.cover)
# Decide on how to subset the available samples
availables <- c(50, 100, 500, 1000, 10000, 100000, 1000000,
                2000000, 3000000, 4000000)
# Find the indices of the deer locations (index.1) and the available samples (index.0)
index.0 <- which(rsf.dat.large$case_ == 0)
index.1 <- which(rsf.dat.large$case_ == 1)
# Preallocate the coefficient matrix (was grown row-by-row with rbind)
coef.mat <- matrix(NA_real_, nrow = length(availables), ncol = 2,
                   dimnames = list(NULL, c("forest", "shrub.cover")))
# Loop through the numbers of available samples, iteratively grabbing an
# increasing set of zeros. Combine these zeros with the used deer locations
# and then fit the model.
for (i in seq_along(availables)) {
  rsf.smaller <- rbind(rsf.dat.large[index.1, ],
                       rsf.dat.large[index.0[seq_len(availables[i])], ])
  fit <- glm(case_ ~ forest + shrub.cover, data = rsf.smaller,
             family = binomial(link = "logit"))
  # Store the two slope coefficients (drop the intercept)
  coef.mat[i, ] <- coef(fit)[c("forest", "shrub.cover")]
}
# Free the very large data set once the loop is done
remove(rsf.dat.large)
knitr::kable(cbind(availables, coef.mat))
availables | forest | shrub.cover |
---|---|---|
5e+01 | 4.082826 | -1.1408137 |
1e+02 | 4.540003 | -0.9765100 |
5e+02 | 3.587066 | -0.6990556 |
1e+03 | 3.370832 | -0.7103731 |
1e+04 | 3.145356 | -0.5694584 |
1e+05 | 3.158024 | -0.5265558 |
1e+06 | 3.169989 | -0.5271564 |
2e+06 | 3.152163 | -0.5266072 |
3e+06 | 3.151536 | -0.5261268 |
4e+06 | 3.149747 | -0.5259137 |
Let’s plot these coefficients by the available sample size
# Plot the coefficient estimates against the available sample size
# Fixed y-axis label typo: "Coeficient" -> "Coefficient"
plot(availables, coef.mat[, 1], type = "b", lwd = 3, ylim = c(-1, 4),
     xlab = "Available Sample Size", ylab = "Coefficient")
lines(availables, coef.mat[, 2], type = "b", lwd = 3, col = 2)
We can see that the first and second decimal place values converge for both coefficients by 2,000,000 available samples.
Next, let's specifically compare the converged coefficient estimates to the estimates obtained when using a 1:1 ratio of used to available locations.
# Draw a 1:1 available sample (one available point per used deer location)
rsf.dat <- random_points(hr, n = nrow(deer), presence = deer) |>
  extract_covariates(forest.cover) |>
  extract_covariates(shrub.cover)
# Fit the same logistic-regression RSF to the 1:1 data set
fit <- glm(case_ ~ forest + shrub.cover, data = rsf.dat,
           family = binomial(link = "logit"))
summary(fit)
##
## Call:
## glm(formula = case_ ~ forest + shrub.cover, family = binomial(link = "logit"),
## data = rsf.dat)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.59500 0.06568 -9.06 <2e-16 ***
## forest 3.09338 0.26379 11.73 <2e-16 ***
## shrub.cover -0.79583 0.06391 -12.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2290.2 on 1651 degrees of freedom
## Residual deviance: 1955.9 on 1649 degrees of freedom
## AIC: 1961.9
##
## Number of Fisher Scoring iterations: 4
# Difference b/w converged (last row of coef.mat) and 1:1-ratio estimates;
# outer parentheses force the result to print
(coef.mat[10, ] - coef(fit)[2:3])
## forest shrub.cover
## 0.05636538 0.26991436
We can see that if we had used a 1:1 ratio of used to available, we would have underestimated the effect of forest and overestimated the effect of shrub cover.
### Setup environment
# Attach a day/night time-of-day factor to each deer relocation
deer <- time_of_day(deer)
head(deer)
## # A tibble: 6 × 5
## x_ y_ t_ burst_ tod_
## * <dbl> <dbl> <dttm> <dbl> <fct>
## 1 4314068. 3445807. 2008-03-30 00:01:47 1 night
## 2 4314053. 3445768. 2008-03-30 06:00:54 1 day
## 3 4314105. 3445859. 2008-03-30 12:01:47 1 day
## 4 4314044. 3445785. 2008-03-30 18:01:24 1 night
## 5 4313015. 3445858. 2008-03-31 00:01:23 1 night
## 6 4312860. 3445857. 2008-03-31 06:01:45 1 day
# Create MCP home range with a 5000 m buffer
hr <- hr_mcp(deer) |>
  hr_isopleths() |>
  sf::st_buffer(dist = 5000)
# Draw random available samples (seed fixed for reproducibility)
set.seed(5454)
rsf1 <- random_points(hr, n = 20000, presence = deer) |>
  extract_covariates(forest.cover)
# Tabulate available (FALSE) vs used (TRUE) locations
table(rsf1$case_)
##
## FALSE TRUE
## 20000 826
# Visualize the used and available locations
plot(rsf1)
### Create data set with time of day
# Locate the rows holding used locations and available samples
used.idx <- which(rsf1$case_ == 1)
avail.idx <- which(rsf1$case_ == 0)
# Create a new 'time of day' column, initialized with a placeholder
rsf1$tod <- "temp"
# Used locations get their observed time of day
# (assumes rsf1's used rows are in the same order as deer — TODO confirm)
rsf1$tod[used.idx] <- as.character(deer$tod_)
# Label every available sample as 'day'...
rsf1$tod[avail.idx] <- "day"
# ...then duplicate the available sample as a 'night' copy so availability
# is represented in both time periods
night.copy <- rsf1[avail.idx, ]
night.copy$tod <- "night"
# Stack the used locations with the day and night available samples
rsf.combined <- rbind(rsf1, night.copy)
# Fit the RSF with a forest-by-time-of-day interaction
rsf2 <- glm(case_ ~ forest * tod, data = rsf.combined,
            family = binomial(link = "logit"))
summary(rsf2)
##
## Call:
## glm(formula = case_ ~ forest * tod, family = binomial(link = "logit"),
## data = rsf.combined)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.06057 0.05578 -72.797 < 2e-16 ***
## forest 2.89745 0.15362 18.861 < 2e-16 ***
## todnight -0.49591 0.09046 -5.482 4.21e-08 ***
## forest:todnight 0.05743 0.24807 0.232 0.817
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8078.8 on 40825 degrees of freedom
## Residual deviance: 7603.4 on 40822 degrees of freedom
## AIC: 7611.4
##
## Number of Fisher Scoring iterations: 7
First, recall that the intercept is not itself meaningful in an RSF. The forest coefficient is positive, indicating that selection increases with forest cover during the day (the reference level). The todnight coefficient describes relative selection at a forest cover of zero during the night; because it is negative, we infer that deer avoid these open areas at night. Both of these coefficients are statistically clear. The interaction is not statistically clear, so there is no statistically clear difference between night and day in how selection changes with increasing forest cover.
# Prediction grids of covariate combinations for night and day
newdata1 <- data.frame(forest = seq(0, 1, by = 0.1), tod = "night")
newdata2 <- data.frame(forest = seq(0, 1, by = 0.1), tod = "day")
# Zero the stored intercept so it drops out of the predictions
# (RSF predictions are relative, so the intercept is not meaningful)
rsf2$coefficients[1] <- 0
coef(rsf2)
## (Intercept) forest todnight forest:todnight
## 0.00000000 2.89744563 -0.49590813 0.05743005
# Combine linear terms of covariates and coefficients (link scale;
# intercept was zeroed above)
preds.link.night <- predict(rsf2, newdata = newdata1, type = "link")
preds.link.day <- predict(rsf2, newdata = newdata2, type = "link")
# Exponentiate linear terms to get relative intensity of selection
relative.prob.night <- exp(preds.link.night)
relative.prob.day <- exp(preds.link.day)
# Plot relative intensity of selection by forest cover, night vs day
plot(newdata1$forest, relative.prob.night, type = "l", lwd = 3,
     xlab = "Forest Cover", ylab = "Relative Intensity of Selection",
     ylim = c(0, 12))
lines(newdata2$forest, relative.prob.day, type = "l", lwd = 3, col = 2)
# Reference line: values above 1 indicate selection, below 1 avoidance
abline(h = 1, lwd = 3, col = 3, lty = 3)
# Two legend labels, so supply exactly two colors
# (was col = c(1, 2, 3), a colors/labels mismatch)
legend("topleft", lwd = 3, col = c(1, 2), legend = c("Night", "Day"))
text(0.9, 1.4, "Selection")
text(0.9, 0.7, "Avoidance")