Statistik 3: Demo

Korrelation vs. Regression

## Korrelationen und Regressionen

# Datensatz zum Einfluss von Stickstoffdepositionen auf den Pflanzenartenreichtum
library(readr)
library(ggplot2)

df <- read_delim("datasets/stat/Nitrogen.csv", ";")

summary(df)

  N_deposition   Species_richness
 Min.   : 2.00   Min.   :12.0    
 1st Qu.: 9.00   1st Qu.:17.5    
 Median :20.00   Median :21.0    
 Mean   :20.53   Mean   :20.2    
 3rd Qu.:30.50   3rd Qu.:23.0    
 Max.   :55.00   Max.   :28.0

# Plotten der Beziehung
ggplot(df, aes(x = N_deposition, y = Species_richness)) +
  geom_point()

# Pearson Korrelation
# zuerst Species_richness dann N_deposition
cor.test(df$Species_richness, df$N_deposition, method = "pearson")


    Pearson's product-moment correlation

data:  df$Species_richness and df$N_deposition
t = -5.2941, df = 13, p-value = 0.0001453
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.9405572 -0.5450218
sample estimates:
       cor 
-0.8265238

# Pearson Korrelation
# zuerst Species_richness dann N_deposition
cor.test(df$N_deposition, df$Species_richness, method = "pearson") # zuerst N_deposition dann Species_richness


    Pearson's product-moment correlation

data:  df$N_deposition and df$Species_richness
t = -5.2941, df = 13, p-value = 0.0001453
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.9405572 -0.5450218
sample estimates:
       cor 
-0.8265238

# Rang-Korrelation Spearman
cor.test(df$Species_richness, df$N_deposition, method = "spearman")


    Spearman's rank correlation rho

data:  df$Species_richness and df$N_deposition
S = 1015.5, p-value = 0.0002259
alternative hypothesis: true rho is not equal to 0
sample estimates:
       rho 
-0.8133721

# Rang-Korrelation Kendall
cor.test(df$Species_richness, df$N_deposition, method = "kendall")


    Kendall's rank correlation tau

data:  df$Species_richness and df$N_deposition
z = -3.308, p-value = 0.0009398
alternative hypothesis: true tau is not equal to 0
sample estimates:
      tau 
-0.657115

# Jetzt als Regression
lm1 <- lm(Species_richness ~ N_deposition, data = df) # zuerst Species_richness dann N_deposition
lm1


Call:
lm(formula = Species_richness ~ N_deposition, data = df)

Coefficients:
 (Intercept)  N_deposition  
     25.6050       -0.2632

lm2 <- lm(N_deposition ~ Species_richness, data = df) # zuerst N_deposition dann Species_richness  
lm2


Call:
lm(formula = N_deposition ~ Species_richness, data = df)

Coefficients:
     (Intercept)  Species_richness  
          72.957            -2.595

anova(lm1) # ANOVA-Tabelle, 1. Möglichkeit

Analysis of Variance Table

Response: Species_richness
             Df Sum Sq Mean Sq F value    Pr(>F)    
N_deposition  1 233.91 233.908  28.028 0.0001453 ***
Residuals    13 108.49   8.346                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary.aov(lm1) # ANOVA-Tabelle, 2. Möglichkeit

             Df Sum Sq Mean Sq F value   Pr(>F)    
N_deposition  1  233.9  233.91   28.03 0.000145 ***
Residuals    13  108.5    8.35                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(lm1) # Regressionskoeffizienten


Call:
lm(formula = Species_richness ~ N_deposition, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.9184 -1.9992  0.4493  2.0015  4.6081 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  25.60502    1.26440  20.251 3.25e-11 ***
N_deposition -0.26323    0.04972  -5.294 0.000145 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.889 on 13 degrees of freedom
Multiple R-squared:  0.6831,    Adjusted R-squared:  0.6588 
F-statistic: 28.03 on 1 and 13 DF,  p-value: 0.0001453

# Signifikantes Ergebnis visualisieren

ggplot(df, aes(x = N_deposition, y = Species_richness)) +
  geom_point() +
  geom_abline(intercept = lm1$coefficients[1], slope = lm1$coefficients[2], color = "blue")

#BaseR Variante plot(Species_richness ~ N_deposition, data = df) + abline(lm1)

Einfache und Polynomische Regression

# Daten generieren 
pred <- c(20, 19, 25, 10, 8, 15, 13, 18, 11, 14, 25, 39, 38, 28, 24) # "pred" sei unsere unabhängige Variable
resp <- c(12, 15, 10, 7, 2, 10, 12, 11, 13, 10, 9, 2, 4, 7, 13) # "resp" sei unsere abhängige Variable
data <- data.frame(pred, resp) # Dataframe erstellen

# Daten anschauen
ggplot(data, aes(x = pred, y = resp)) +
  geom_point()

# Modell definieren ud anschauen
lm_1 <- lm(resp ~ pred) # Einfaches lineares Modell
summary(lm_1) # Modell anschauen


Call:
lm(formula = resp ~ pred)

Residuals:
    Min      1Q  Median      3Q     Max 
-9.0549 -1.7015  0.5654  2.0617  5.6406 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  12.2879     2.4472   5.021 0.000234 ***
pred         -0.1541     0.1092  -1.412 0.181538    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.863 on 13 degrees of freedom
Multiple R-squared:  0.1329,    Adjusted R-squared:  0.06622 
F-statistic: 1.993 on 1 and 13 DF,  p-value: 0.1815

-> kein signifikanter Zusammenhang im einfachen linearen Modell und entsprechend kleines Bestimmtheitsmass (adj. R² = 0.07)

# Polynomische Regression Modell definieren und anschauen 
lm_quad <- lm(resp ~ pred + I(pred^2)) # lineares Modell mit quadratischem Termsummary

summary(lm_quad) # lineares Modell mit quadratischem Term anschauen


Call:
lm(formula = resp ~ pred + I(pred^2))

Residuals:
    Min      1Q  Median      3Q     Max 
-4.3866 -1.1018 -0.2027  1.3831  4.4211 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept) -2.239308   3.811746  -0.587  0.56777   
pred         1.330933   0.360105   3.696  0.00306 **
I(pred^2)   -0.031587   0.007504  -4.209  0.00121 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.555 on 12 degrees of freedom
Multiple R-squared:  0.6499,    Adjusted R-squared:  0.5915 
F-statistic: 11.14 on 2 and 12 DF,  p-value: 0.001842

-> Signifikanter Zusammenhang und viel besseres Bestimmtheitsmass (adj. R² = 0.60)

# Modelle darstellen

# Vorhersagen der Modelle generieren 
xv <- seq(min(pred), max(pred), length = 100) # 100 x-Werte, mit denen man die Modelle "füttern" kann 
y_lm1 <- predict(lm_1, data.frame(pred = xv)) # Vorhersagen des quadratischen Modells für die y-Werte
y_lm_quad <- predict(lm_quad, data.frame(pred = xv)) # Vorhersagen des quadratischen Modells für die y-Werte
ModPred <- data.frame(xv, y_lm1, y_lm_quad)

# Modellvorhersagen plotten

ggplot(data, aes(x = pred, y = resp)) +
  geom_point() +
  geom_line(data = ModPred, aes(x = xv, y = y_lm1), color = "red", linetype = "dashed") + 
  geom_line(data = ModPred, aes(x = xv, y = y_lm_quad), color = "blue")

# Alternativ kann man die Modelle Modellvorhersagen auch direkt in ggplot rechnen
# + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) # Einfache Lineare Regression
# + geom_smooth(method = "lm", formula = y ~ x + I(x^2), se = FALSE) # Mit quadratischem Term

# Residualplots
par(mfrow = c(2, 2))
plot(lm_1, main = "Lineares Modell")

plot(lm_quad, main = "Quadratisches  Modell")

-> Die Plots sehen beim Modell mit quadratischem Term besser aus

ANCOVA

Experiment zur Fruchtproduktion (“Fruit”) von Ipomopsis sp. in Abhängigkeit von der Beweidung (“Grazing” mit 2 Levels: “Grazed”, “Ungrazed”) und korrigiert für die Pflanzengrösse vor der Beweidung (hier ausgedrückt als Durchmesser an der Spitze des Wurzelstock: “Root”)

# Daten einlesen und anschauen
library("readr")

compensation <- read_delim("datasets/stat/ipomopsis.csv")
compensation$Grazing <- as.factor(compensation$Grazing)

head(compensation)

# A tibble: 6 × 3
   Root Fruit Grazing 
  <dbl> <dbl> <fct>   
1  6.22  59.8 Ungrazed
2  6.49  61.0 Ungrazed
3  4.92  14.7 Ungrazed
4  5.13  19.3 Ungrazed
5  5.42  34.2 Ungrazed
6  5.36  35.5 Ungrazed

summary(compensation)

      Root            Fruit            Grazing  
 Min.   : 4.426   Min.   : 14.73   Grazed  :20  
 1st Qu.: 6.083   1st Qu.: 41.15   Ungrazed:20  
 Median : 7.123   Median : 60.88                
 Mean   : 7.181   Mean   : 59.41                
 3rd Qu.: 8.510   3rd Qu.: 76.19                
 Max.   :10.253   Max.   :116.05

# Plotten der vollständigen Daten/Information
library("ggplot2")
ggplot(compensation, aes(x = Root, y = Fruit, color = Grazing)) +
  geom_point()

-> Je grösser die Pflanze, desto grösser ihre Fruchtproduktion. -> Die grössere Fruchtproduktion innerhalb der beweideten Gruppe scheint ein Resultat von unterschiedlichen Pflanzengrössen zwischen den Gruppen zu sein.

# Lineare Modelle definieren und anschauen

aoc_1 <- lm(Fruit ~ Root * Grazing, data = compensation) # Volles Modell mit Interaktion
summary.aov(aoc_1)

             Df Sum Sq Mean Sq F value   Pr(>F)    
Root          1  16795   16795 359.968  < 2e-16 ***
Grazing       1   5264    5264 112.832 1.21e-12 ***
Root:Grazing  1      5       5   0.103     0.75    
Residuals    36   1680      47                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

aoc_2 <- lm(Fruit ~ Grazing + Root, data = compensation) # Finales Modell ohne die (nicht signifikante) Interaktion
summary.aov(aoc_2) # ANOVA-Tabelle

            Df Sum Sq Mean Sq F value  Pr(>F)    
Grazing      1   2910    2910   63.93 1.4e-09 ***
Root         1  19149   19149  420.62 < 2e-16 ***
Residuals   37   1684      46                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(aoc_2) # Parameter-Tabelle


Call:
lm(formula = Fruit ~ Grazing + Root, data = compensation)

Residuals:
     Min       1Q   Median       3Q      Max 
-17.1920  -2.8224   0.3223   3.9144  17.3290 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -127.829      9.664  -13.23 1.35e-15 ***
GrazingUngrazed   36.103      3.357   10.75 6.11e-13 ***
Root              23.560      1.149   20.51  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.747 on 37 degrees of freedom
Multiple R-squared:  0.9291,    Adjusted R-squared:  0.9252 
F-statistic: 242.3 on 2 and 37 DF,  p-value: < 2.2e-16

# Residualplots anschauen
par(mfrow = c(2, 2)) # 2x2 Plots pro Grafik
plot(aoc_2)

par(mfrow = c(1, 1)) # Grafik zurücksetzen

-> Das ANCOVA-Modell widerspiegelt die Zusammenhänge wie sie aufgrund der grafisch dargestellten Daten zu vermuten sind gut. Die Residual-Plots zeigen 3 Ausreisser (Beobachtungen 27, 34 und 37), welche “aus der Reihe tanzen”.