BiEc6_N Modellgüte und -diagnostics MM

Libraries laden

Packages die wir für die Modelle und die Diagnostics brauchen

library("lme4")
library("bbmle")
library("MuMIn")
library("dplyr")
library("readr")
library("ggplot2")
library("DHARMa")
library("car")
library("MASS")
library("ROCR")
library("sjPlot")
library("ggeffects")
library("sjstats")
library("cowplot")
library("gstat")
library("purrr")
library("broom.mixed")

Ausgangslage

Der Modellfit aus Aufgabe 5 von letzter Woche dient als Ausgangspunkt für die heutigen Übungen.

DF_mod_day <- read_delim("datasets/fallstudie_n/Aufgabe4_Datensatz_Habitatnutzung_Modelle_231027_moodle.csv", delim = ";") |>
  filter(time_of_day == "day") |>
  mutate(
    slope_scaled = scale(slope),
    topo_pos_scaled = scale(topo_pos),
    us_scaled = scale(us_2014),
    os_scaled = scale(os_2014),
    forest_prop_scaled = scale(forest_prop),
    dist_road_all_scaled = scale(dist_road_all),
    dist_road_only_scaled = scale(dist_road_only),
    dist_sett_scaled = scale(dist_sett),
    remoteness_scaled = scale(remoteness),
    id = as.factor(id)
  )

f <- pres_abs ~
  slope_scaled +
  us_scaled +
  os_scaled +
  forest_prop_scaled +
  dist_road_only_scaled +
  dist_sett_scaled +
  remoteness_scaled

f <- paste(c(f, "+ (1 | id)"), collapse = " ") |> as.formula()

m_day <- glmer(f, data = DF_mod_day, family = binomial, na.action = "na.fail")

all_m <- dredge(m_day)

avgmodel <- model.avg(all_m, rank = "AICc", subset = delta < 2)
summary(avgmodel)
## 
## Call:
## model.avg(object = get.models(object = all_m, subset = delta < 
##     2), rank = "AICc")
## 
## Component model call: 
## glmer(formula = pres_abs ~ <4 unique rhs>, data = DF_mod_day, family = 
##      binomial, na.action = na.fail)
## 
## Component models: 
##       df   logLik    AICc delta weight
## 1357   6 -2261.70 4535.41  0.00   0.39
## 12357  7 -2261.02 4536.07  0.65   0.28
## 13567  7 -2261.52 4537.06  1.65   0.17
## 13457  7 -2261.60 4537.23  1.82   0.16
## 
## Term codes: 
## dist_road_only_scaled      dist_sett_scaled    forest_prop_scaled 
##                     1                     2                     3 
##             os_scaled     remoteness_scaled          slope_scaled 
##                     4                     5                     6 
##             us_scaled 
##                     7 
## 
## Model-averaged coefficients:  
## (full average) 
##                        Estimate Std. Error Adjusted SE z value Pr(>|z|)    
## (Intercept)           -0.437730   0.148042    0.148087   2.956  0.00312 ** 
## dist_road_only_scaled  0.510519   0.052598    0.052614   9.703  < 2e-16 ***
## forest_prop_scaled     0.844263   0.058809    0.058826  14.352  < 2e-16 ***
## remoteness_scaled     -0.255199   0.056758    0.056775   4.495    7e-06 ***
## us_scaled              0.382072   0.040946    0.040959   9.328  < 2e-16 ***
## dist_sett_scaled      -0.020550   0.046737    0.046744   0.440  0.66021    
## slope_scaled          -0.006037   0.027844    0.027851   0.217  0.82839    
## os_scaled              0.004232   0.026310    0.026317   0.161  0.87225    
##  
## (conditional average) 
##                       Estimate Std. Error Adjusted SE z value Pr(>|z|)    
## (Intercept)           -0.43773    0.14804     0.14809   2.956  0.00312 ** 
## dist_road_only_scaled  0.51052    0.05260     0.05261   9.703  < 2e-16 ***
## forest_prop_scaled     0.84426    0.05881     0.05883  14.352  < 2e-16 ***
## remoteness_scaled     -0.25520    0.05676     0.05678   4.495    7e-06 ***
## us_scaled              0.38207    0.04095     0.04096   9.328  < 2e-16 ***
## dist_sett_scaled      -0.07295    0.06270     0.06272   1.163  0.24479    
## slope_scaled          -0.03532    0.05918     0.05919   0.597  0.55069    
## os_scaled              0.02690    0.06157     0.06159   0.437  0.66226    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Die Modellresultate aus dem avgmodel sind grundätzlich die finalen Resultate die bereits interpretiert werden könnten. Allerdings funktionieren die Diagnosetests und die Darstellung der Resultate mit diesem gemittelten Modell nicht sehr gut, weshalb wir einen re-fit mit glmer machen müssen (an den Resultaten ändert sich dadurch nichts)

# hier zum Vergleich, dass die Resulate sich nur marginal verändern

summary(avgmodel)
## 
## Call:
## model.avg(object = get.models(object = all_m, subset = delta < 
##     2), rank = "AICc")
## 
## Component model call: 
## glmer(formula = pres_abs ~ <4 unique rhs>, data = DF_mod_day, family = 
##      binomial, na.action = na.fail)
## 
## Component models: 
##       df   logLik    AICc delta weight
## 1357   6 -2261.70 4535.41  0.00   0.39
## 12357  7 -2261.02 4536.07  0.65   0.28
## 13567  7 -2261.52 4537.06  1.65   0.17
## 13457  7 -2261.60 4537.23  1.82   0.16
## 
## Term codes: 
## dist_road_only_scaled      dist_sett_scaled    forest_prop_scaled 
##                     1                     2                     3 
##             os_scaled     remoteness_scaled          slope_scaled 
##                     4                     5                     6 
##             us_scaled 
##                     7 
## 
## Model-averaged coefficients:  
## (full average) 
##                        Estimate Std. Error Adjusted SE z value Pr(>|z|)    
## (Intercept)           -0.437730   0.148042    0.148087   2.956  0.00312 ** 
## dist_road_only_scaled  0.510519   0.052598    0.052614   9.703  < 2e-16 ***
## forest_prop_scaled     0.844263   0.058809    0.058826  14.352  < 2e-16 ***
## remoteness_scaled     -0.255199   0.056758    0.056775   4.495    7e-06 ***
## us_scaled              0.382072   0.040946    0.040959   9.328  < 2e-16 ***
## dist_sett_scaled      -0.020550   0.046737    0.046744   0.440  0.66021    
## slope_scaled          -0.006037   0.027844    0.027851   0.217  0.82839    
## os_scaled              0.004232   0.026310    0.026317   0.161  0.87225    
##  
## (conditional average) 
##                       Estimate Std. Error Adjusted SE z value Pr(>|z|)    
## (Intercept)           -0.43773    0.14804     0.14809   2.956  0.00312 ** 
## dist_road_only_scaled  0.51052    0.05260     0.05261   9.703  < 2e-16 ***
## forest_prop_scaled     0.84426    0.05881     0.05883  14.352  < 2e-16 ***
## remoteness_scaled     -0.25520    0.05676     0.05678   4.495    7e-06 ***
## us_scaled              0.38207    0.04095     0.04096   9.328  < 2e-16 ***
## dist_sett_scaled      -0.07295    0.06270     0.06272   1.163  0.24479    
## slope_scaled          -0.03532    0.05918     0.05919   0.597  0.55069    
## os_scaled              0.02690    0.06157     0.06159   0.437  0.66226    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(m_day)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## pres_abs ~ slope_scaled + us_scaled + os_scaled + forest_prop_scaled +  
##     dist_road_only_scaled + dist_sett_scaled + remoteness_scaled +  
##     (1 | id)
##    Data: DF_mod_day
## 
##      AIC      BIC   logLik deviance df.resid 
##   4539.3   4595.9  -2260.7   4521.3     3955 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.1673 -0.7219 -0.3612  0.8565  4.0178 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  id     (Intercept) 0.2387   0.4886  
## Number of obs: 3964, groups:  id, 12
## 
## Fixed effects:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -0.44517    0.14715  -3.025 0.002484 ** 
## slope_scaled          -0.04861    0.06215  -0.782 0.434154    
## us_scaled              0.37862    0.04293   8.820  < 2e-16 ***
## os_scaled              0.03339    0.06512   0.513 0.608141    
## forest_prop_scaled     0.85465    0.07448  11.474  < 2e-16 ***
## dist_road_only_scaled  0.51282    0.05294   9.687  < 2e-16 ***
## dist_sett_scaled      -0.07131    0.06324  -1.127 0.259536    
## remoteness_scaled     -0.23229    0.06529  -3.558 0.000374 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) slp_sc us_scl os_scl frst__ dst___ dst_s_
## slope_scald  0.019                                          
## us_scaled    0.005 -0.093                                   
## os_scaled    0.004 -0.301 -0.288                            
## frst_prp_sc -0.077  0.037  0.057 -0.594                     
## dst_rd_nly_ -0.018  0.089  0.016  0.006  0.187              
## dst_stt_scl  0.051  0.014  0.023  0.117 -0.466 -0.113       
## rmtnss_scld  0.018 -0.551 -0.006  0.078 -0.201 -0.481 -0.010

Aufgabe 1

Berechung der AUC (area under the receiver operating characteristic curve) = Mass der Modellgüte

Für die Berechnung des AUC findet ihr weiterführende Informationen unter: Link

Code

prob <- predict(m_day, type = c("response"))
pred <- prediction(prob, DF_mod_day$pres_abs)

?prediction

# AUC

auc <- performance(pred, measure = "auc")@y.values[[1]]
auc

Aufgabe 2

Interpretieren der Modell-Residuen mittels Tests auf verschiedene Aspekte

Model testing for over/underdispersion, zeroinflation and spatial autocorrelation following the DHARMa package.
unbedingt die Vignette des DHARMa-Package konsultieren: Link

Code

# Residuals werden über eine Simulation auf eine Standard-Skala transformiert und
# können anschliessend getestet werden. Dabei kann die Anzahl Simulationen eingestellt
# werden (dauert je nach dem sehr lange)

simulationOutput <- simulateResiduals(fittedModel = m_day, n = 10000)

# plotting and testing scaled residuals

plot(simulationOutput)
testResiduals(simulationOutput)

# The most common concern for GLMMs is overdispersion, underdispersion and
# zero-inflation.

# separate test for dispersion

testDispersion(simulationOutput)

# test for Zeroinflation

testZeroInflation(simulationOutput)

# test for spatial Autocorrelation

# calculating x, y positions per group
groupLocations <- aggregate(DF_mod_day[, 3:4], list(DF_mod_day$x, DF_mod_day$y), mean)
groupLocations$group <- paste(groupLocations$Group.1, groupLocations$Group.2)
groupLocations <- groupLocations |> dplyr::select(x,y,group)



# calculating residuals per group
res2 = recalculateResiduals(simulationOutput, group = groupLocations$group)

# running the spatial test on grouped residuals
testSpatialAutocorrelation(res2, groupLocations$x, groupLocations$y, plot = F)

# Testen auf Multicollinearität (dh zu starke Korrelationen im finalen Modell, zB falls
# auf Grund der ökologischen Plausibilität stark korrelierte Variablen im Modell)
# use VIF values: if values less then 5 is ok (sometimes > 10), if mean of VIF values
# not substantially greater than 1 (say 5), no need to worry.

car::vif(m_day)
mean(car::vif(m_day))

Aufgabe 3

Graphische Darstellung der Modellresultate

Code

# graphische Darstellung der gesamten Modellresultate

plot_model(m_day, transform = NULL, show.values = TRUE, value.offset = .3)

# Plotten der vorhergesagten Wahrscheinlichkeit, dass ein Kreis besetzt ist, in
# Abhängigkeit der erklärenden Variable basierend auf den Modellresultaten.

plot_model(m_day, type = "pred", terms = "us_scaled [all]")

# Problem: skalierte Variablen lassen sich nicht so ohne weiteres plotten, hier ein quick-
# and-dirty hack um das Problem zu umgehen. Die Einstellungen müssen für jede Variable
# geändert werden

p <- plot_model(m_day, type = "pred", terms = "us_scaled [all]")

labels <- round(seq(floor(min(DF_mod_day$us_2014)), ceiling(max(DF_mod_day$us_2014)), length.out = 8), 2)

p <- p + scale_x_continuous(breaks = c(-1, 0, 1, 2, 3, 4, 5, 6), labels = c(labels))

p

# Funktion um viele Plots auf einem zusammenbringen: cowplot-package (hat auch sonst
# gute Funktionen für schöne Layouts für Grafiken)

cowplot::plot_grid()

Aufgabe 4

Ermittlung des individuellen Beitrags der einzelen Variablen im Gesamtmodell

Bestimmen delta AIC nach Coppes u. a. (2017) → Vergleich des Gesamtmodells gegenüber einem Modell ohne die entsprechende Variable.
Auftrag auf nächste Woche: Kurze Vorstellung der Modellresultate & Diagnostics im Plenum und Diskussion der Ergebnisse (keine PP-Präsentation nötig)

Code

m_os <- glmer(pres_abs ~
  slope_scaled +
  us_scaled +
  forest_prop_scaled +
  dist_road_only_scaled +
  dist_sett_scaled +
  remoteness_scaled +
  (1 | id), data = DF_mod_day, family = binomial, na.action = "na.fail")

m_us <- glmer(pres_abs ~
  slope_scaled +
  forest_prop_scaled +
  dist_road_only_scaled +
  dist_sett_scaled +
  remoteness_scaled +
  os_scaled +
  (1 | id), data = DF_mod_day, family = binomial, na.action = "na.fail")

m_road <- glmer(pres_abs ~
  slope_scaled +
  forest_prop_scaled +
  dist_sett_scaled +
  remoteness_scaled +
  us_scaled +
  os_scaled +
  (1 | id), data = DF_mod_day, family = binomial, na.action = "na.fail")

m_forest <- glmer(pres_abs ~
  dist_road_only_scaled +
  slope_scaled +
  dist_sett_scaled +
  remoteness_scaled +
  us_scaled +
  os_scaled +
  (1 | id), data = DF_mod_day, family = binomial, na.action = "na.fail")

m_sett <- glmer(pres_abs ~
  dist_road_only_scaled +
  slope_scaled +
  forest_prop_scaled +
  remoteness_scaled +
  us_scaled +
  os_scaled +
  (1 | id), data = DF_mod_day, family = binomial, na.action = "na.fail")

m_slope <- glmer(pres_abs ~
  dist_road_only_scaled +
  forest_prop_scaled +
  remoteness_scaled +
  dist_sett_scaled +
  us_scaled +
  os_scaled +
  (1 | id), data = DF_mod_day, family = binomial, na.action = "na.fail")

m_remote <- glmer(pres_abs ~
  dist_road_only_scaled +
  forest_prop_scaled +
  slope_scaled +
  dist_sett_scaled +
  us_scaled +
  os_scaled +
  (1 | id), data = DF_mod_day, family = binomial, na.action = "na.fail")

bbmle::AICtab(m_day, m_os, m_us, m_road, m_forest, m_sett, m_slope, m_remote)
##          dAIC  df
## m_os       0.0 8 
## m_slope    0.3 8 
## m_sett     1.0 8 
## m_day      1.7 9 
## m_remote  12.5 8 
## m_us      81.8 8 
## m_road    98.1 8 
## m_forest 139.5 8