library("lme4")
library("bbmle")
library("MuMIn")
library("dplyr")
library("readr")
library("ggplot2")
library("DHARMa")
library("car")
library("MASS")
library("ROCR")
library("sjPlot")
library("ggeffects")
library("sjstats")
library("cowplot")
library("gstat")
library("purrr")
library("broom.mixed")
BiEc6_N Modellgüte und -diagnostics MM
Libraries laden
Packages die wir für die Modelle und die Diagnostics brauchen
Ausgangslage
- Der Modellfit aus Aufgabe 5 von letzter Woche dient als Ausgangspunkt für die heutigen Übungen.
<- read_delim("datasets/fallstudie_n/Aufgabe4_Datensatz_Habitatnutzung_Modelle_231027_moodle.csv", delim = ";") |>
DF_mod_day filter(time_of_day == "day") |>
mutate(
slope_scaled = scale(slope),
topo_pos_scaled = scale(topo_pos),
us_scaled = scale(us_2014),
os_scaled = scale(os_2014),
forest_prop_scaled = scale(forest_prop),
dist_road_all_scaled = scale(dist_road_all),
dist_road_only_scaled = scale(dist_road_only),
dist_sett_scaled = scale(dist_sett),
remoteness_scaled = scale(remoteness),
id = as.factor(id)
)
<- pres_abs ~
f +
slope_scaled +
us_scaled +
os_scaled +
forest_prop_scaled +
dist_road_only_scaled +
dist_sett_scaled
remoteness_scaled
<- paste(c(f, "+ (1 | id)"), collapse = " ") |> as.formula()
f
<- glmer(f, data = DF_mod_day, family = binomial, na.action = "na.fail")
m_day
<- dredge(m_day)
all_m
<- model.avg(all_m, rank = "AICc", subset = delta < 2)
avgmodel summary(avgmodel)
##
## Call:
## model.avg(object = get.models(object = all_m, subset = delta <
## 2), rank = "AICc")
##
## Component model call:
## glmer(formula = pres_abs ~ <4 unique rhs>, data = DF_mod_day, family =
## binomial, na.action = na.fail)
##
## Component models:
## df logLik AICc delta weight
## 1357 6 -2261.70 4535.41 0.00 0.39
## 12357 7 -2261.02 4536.07 0.65 0.28
## 13567 7 -2261.52 4537.06 1.65 0.17
## 13457 7 -2261.60 4537.23 1.82 0.16
##
## Term codes:
## dist_road_only_scaled dist_sett_scaled forest_prop_scaled
## 1 2 3
## os_scaled remoteness_scaled slope_scaled
## 4 5 6
## us_scaled
## 7
##
## Model-averaged coefficients:
## (full average)
## Estimate Std. Error Adjusted SE z value Pr(>|z|)
## (Intercept) -0.437730 0.148042 0.148087 2.956 0.00312 **
## dist_road_only_scaled 0.510519 0.052598 0.052614 9.703 < 2e-16 ***
## forest_prop_scaled 0.844263 0.058809 0.058826 14.352 < 2e-16 ***
## remoteness_scaled -0.255199 0.056758 0.056775 4.495 7e-06 ***
## us_scaled 0.382072 0.040946 0.040959 9.328 < 2e-16 ***
## dist_sett_scaled -0.020550 0.046737 0.046744 0.440 0.66021
## slope_scaled -0.006037 0.027844 0.027851 0.217 0.82839
## os_scaled 0.004232 0.026310 0.026317 0.161 0.87225
##
## (conditional average)
## Estimate Std. Error Adjusted SE z value Pr(>|z|)
## (Intercept) -0.43773 0.14804 0.14809 2.956 0.00312 **
## dist_road_only_scaled 0.51052 0.05260 0.05261 9.703 < 2e-16 ***
## forest_prop_scaled 0.84426 0.05881 0.05883 14.352 < 2e-16 ***
## remoteness_scaled -0.25520 0.05676 0.05678 4.495 7e-06 ***
## us_scaled 0.38207 0.04095 0.04096 9.328 < 2e-16 ***
## dist_sett_scaled -0.07295 0.06270 0.06272 1.163 0.24479
## slope_scaled -0.03532 0.05918 0.05919 0.597 0.55069
## os_scaled 0.02690 0.06157 0.06159 0.437 0.66226
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
- Die Modellresultate aus dem avgmodel sind grundätzlich die finalen Resultate die bereits interpretiert werden könnten. Allerdings funktionieren die Diagnosetests und die Darstellung der Resultate mit diesem gemittelten Modell nicht sehr gut, weshalb wir einen re-fit mit glmer machen müssen (an den Resultaten ändert sich dadurch nichts)
# hier zum Vergleich, dass die Resulate sich nur marginal verändern
summary(avgmodel)
##
## Call:
## model.avg(object = get.models(object = all_m, subset = delta <
## 2), rank = "AICc")
##
## Component model call:
## glmer(formula = pres_abs ~ <4 unique rhs>, data = DF_mod_day, family =
## binomial, na.action = na.fail)
##
## Component models:
## df logLik AICc delta weight
## 1357 6 -2261.70 4535.41 0.00 0.39
## 12357 7 -2261.02 4536.07 0.65 0.28
## 13567 7 -2261.52 4537.06 1.65 0.17
## 13457 7 -2261.60 4537.23 1.82 0.16
##
## Term codes:
## dist_road_only_scaled dist_sett_scaled forest_prop_scaled
## 1 2 3
## os_scaled remoteness_scaled slope_scaled
## 4 5 6
## us_scaled
## 7
##
## Model-averaged coefficients:
## (full average)
## Estimate Std. Error Adjusted SE z value Pr(>|z|)
## (Intercept) -0.437730 0.148042 0.148087 2.956 0.00312 **
## dist_road_only_scaled 0.510519 0.052598 0.052614 9.703 < 2e-16 ***
## forest_prop_scaled 0.844263 0.058809 0.058826 14.352 < 2e-16 ***
## remoteness_scaled -0.255199 0.056758 0.056775 4.495 7e-06 ***
## us_scaled 0.382072 0.040946 0.040959 9.328 < 2e-16 ***
## dist_sett_scaled -0.020550 0.046737 0.046744 0.440 0.66021
## slope_scaled -0.006037 0.027844 0.027851 0.217 0.82839
## os_scaled 0.004232 0.026310 0.026317 0.161 0.87225
##
## (conditional average)
## Estimate Std. Error Adjusted SE z value Pr(>|z|)
## (Intercept) -0.43773 0.14804 0.14809 2.956 0.00312 **
## dist_road_only_scaled 0.51052 0.05260 0.05261 9.703 < 2e-16 ***
## forest_prop_scaled 0.84426 0.05881 0.05883 14.352 < 2e-16 ***
## remoteness_scaled -0.25520 0.05676 0.05678 4.495 7e-06 ***
## us_scaled 0.38207 0.04095 0.04096 9.328 < 2e-16 ***
## dist_sett_scaled -0.07295 0.06270 0.06272 1.163 0.24479
## slope_scaled -0.03532 0.05918 0.05919 0.597 0.55069
## os_scaled 0.02690 0.06157 0.06159 0.437 0.66226
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(m_day)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## pres_abs ~ slope_scaled + us_scaled + os_scaled + forest_prop_scaled +
## dist_road_only_scaled + dist_sett_scaled + remoteness_scaled +
## (1 | id)
## Data: DF_mod_day
##
## AIC BIC logLik deviance df.resid
## 4539.3 4595.9 -2260.7 4521.3 3955
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.1673 -0.7219 -0.3612 0.8565 4.0178
##
## Random effects:
## Groups Name Variance Std.Dev.
## id (Intercept) 0.2387 0.4886
## Number of obs: 3964, groups: id, 12
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.44517 0.14715 -3.025 0.002484 **
## slope_scaled -0.04861 0.06215 -0.782 0.434154
## us_scaled 0.37862 0.04293 8.820 < 2e-16 ***
## os_scaled 0.03339 0.06512 0.513 0.608141
## forest_prop_scaled 0.85465 0.07448 11.474 < 2e-16 ***
## dist_road_only_scaled 0.51282 0.05294 9.687 < 2e-16 ***
## dist_sett_scaled -0.07131 0.06324 -1.127 0.259536
## remoteness_scaled -0.23229 0.06529 -3.558 0.000374 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) slp_sc us_scl os_scl frst__ dst___ dst_s_
## slope_scald 0.019
## us_scaled 0.005 -0.093
## os_scaled 0.004 -0.301 -0.288
## frst_prp_sc -0.077 0.037 0.057 -0.594
## dst_rd_nly_ -0.018 0.089 0.016 0.006 0.187
## dst_stt_scl 0.051 0.014 0.023 0.117 -0.466 -0.113
## rmtnss_scld 0.018 -0.551 -0.006 0.078 -0.201 -0.481 -0.010
Aufgabe 1
Berechung der AUC (area under the receiver operating characteristic curve) = Mass der Modellgüte
Für die Berechnung des AUC findet ihr weiterführende Informationen unter: Link
Code
<- predict(m_day, type = c("response"))
prob <- prediction(prob, DF_mod_day$pres_abs)
pred
?prediction
# AUC
<- performance(pred, measure = "auc")@y.values[[1]]
auc auc
Aufgabe 2
Interpretieren der Modell-Residuen mittels Tests auf verschiedene Aspekte
- Model testing for over/underdispersion, zeroinflation and spatial autocorrelation following the DHARMa package.
- unbedingt die Vignette des DHARMa-Package konsultieren: Link
Code
# Residuals werden über eine Simulation auf eine Standard-Skala transformiert und
# können anschliessend getestet werden. Dabei kann die Anzahl Simulationen eingestellt
# werden (dauert je nach dem sehr lange)
<- simulateResiduals(fittedModel = m_day, n = 10000)
simulationOutput
# plotting and testing scaled residuals
plot(simulationOutput)
testResiduals(simulationOutput)
# The most common concern for GLMMs is overdispersion, underdispersion and
# zero-inflation.
# separate test for dispersion
testDispersion(simulationOutput)
# test for Zeroinflation
testZeroInflation(simulationOutput)
# test for spatial Autocorrelation
# calculating x, y positions per group
<- aggregate(DF_mod_day[, 3:4], list(DF_mod_day$x, DF_mod_day$y), mean)
groupLocations $group <- paste(groupLocations$Group.1, groupLocations$Group.2)
groupLocations<- groupLocations |> dplyr::select(x,y,group)
groupLocations
# calculating residuals per group
= recalculateResiduals(simulationOutput, group = groupLocations$group)
res2
# running the spatial test on grouped residuals
testSpatialAutocorrelation(res2, groupLocations$x, groupLocations$y, plot = F)
# Testen auf Multicollinearität (dh zu starke Korrelationen im finalen Modell, zB falls
# auf Grund der ökologischen Plausibilität stark korrelierte Variablen im Modell)
# use VIF values: if values less then 5 is ok (sometimes > 10), if mean of VIF values
# not substantially greater than 1 (say 5), no need to worry.
::vif(m_day)
carmean(car::vif(m_day))
Aufgabe 3
Graphische Darstellung der Modellresultate
Code
# graphische Darstellung der gesamten Modellresultate
plot_model(m_day, transform = NULL, show.values = TRUE, value.offset = .3)
# Plotten der vorhergesagten Wahrscheinlichkeit, dass ein Kreis besetzt ist, in
# Abhängigkeit der erklärenden Variable basierend auf den Modellresultaten.
plot_model(m_day, type = "pred", terms = "us_scaled [all]")
# Problem: skalierte Variablen lassen sich nicht so ohne weiteres plotten, hier ein quick-
# and-dirty hack um das Problem zu umgehen. Die Einstellungen müssen für jede Variable
# geändert werden
<- plot_model(m_day, type = "pred", terms = "us_scaled [all]")
p
<- round(seq(floor(min(DF_mod_day$us_2014)), ceiling(max(DF_mod_day$us_2014)), length.out = 8), 2)
labels
<- p + scale_x_continuous(breaks = c(-1, 0, 1, 2, 3, 4, 5, 6), labels = c(labels))
p
p
# Funktion um viele Plots auf einem zusammenbringen: cowplot-package (hat auch sonst
# gute Funktionen für schöne Layouts für Grafiken)
::plot_grid() cowplot
Aufgabe 4
Ermittlung des individuellen Beitrags der einzelen Variablen im Gesamtmodell
- Bestimmen delta AIC nach Coppes u. a. (2017) → Vergleich des Gesamtmodells gegenüber einem Modell ohne die entsprechende Variable.
- Auftrag auf nächste Woche: Kurze Vorstellung der Modellresultate & Diagnostics im Plenum und Diskussion der Ergebnisse (keine PP-Präsentation nötig)
Code
<- glmer(pres_abs ~
m_os +
slope_scaled +
us_scaled +
forest_prop_scaled +
dist_road_only_scaled +
dist_sett_scaled +
remoteness_scaled 1 | id), data = DF_mod_day, family = binomial, na.action = "na.fail")
(
<- glmer(pres_abs ~
m_us +
slope_scaled +
forest_prop_scaled +
dist_road_only_scaled +
dist_sett_scaled +
remoteness_scaled +
os_scaled 1 | id), data = DF_mod_day, family = binomial, na.action = "na.fail")
(
<- glmer(pres_abs ~
m_road +
slope_scaled +
forest_prop_scaled +
dist_sett_scaled +
remoteness_scaled +
us_scaled +
os_scaled 1 | id), data = DF_mod_day, family = binomial, na.action = "na.fail")
(
<- glmer(pres_abs ~
m_forest +
dist_road_only_scaled +
slope_scaled +
dist_sett_scaled +
remoteness_scaled +
us_scaled +
os_scaled 1 | id), data = DF_mod_day, family = binomial, na.action = "na.fail")
(
<- glmer(pres_abs ~
m_sett +
dist_road_only_scaled +
slope_scaled +
forest_prop_scaled +
remoteness_scaled +
us_scaled +
os_scaled 1 | id), data = DF_mod_day, family = binomial, na.action = "na.fail")
(
<- glmer(pres_abs ~
m_slope +
dist_road_only_scaled +
forest_prop_scaled +
remoteness_scaled +
dist_sett_scaled +
us_scaled +
os_scaled 1 | id), data = DF_mod_day, family = binomial, na.action = "na.fail")
(
<- glmer(pres_abs ~
m_remote +
dist_road_only_scaled +
forest_prop_scaled +
slope_scaled +
dist_sett_scaled +
us_scaled +
os_scaled 1 | id), data = DF_mod_day, family = binomial, na.action = "na.fail")
(
::AICtab(m_day, m_os, m_us, m_road, m_forest, m_sett, m_slope, m_remote)
bbmle## dAIC df
## m_os 0.0 8
## m_slope 0.3 8
## m_sett 1.0 8
## m_day 1.7 9
## m_remote 12.5 8
## m_us 81.8 8
## m_road 98.1 8
## m_forest 139.5 8