Tag 5 Korrelation und Regression

5.1 Lösung Tag 4

siehe Tag 4.

5.2 Korrelation und Regression

https://figshare.com/articles/Storks_and_human_babies_data/839299/1 oder https://github.com/jannikbuhr/dataIntro19/tree/master/data

library(tidyverse)
library(broom)

storks <- read_csv("data/05_storks.csv")
storks

ggplot(storks, aes(Storks, Birth)) +
  geom_point()

# explain log transform and relevance in biology

ggplot(storks, aes(Storks, Birth)) +
  geom_point() +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10") +
  annotation_logticks() +
  theme_classic()

model <- lm(Birth ~ Storks, data = storks)
model
summary(model)

ggplot(storks, aes(Storks, Birth)) +
  geom_point() +
  scale_x_continuous(trans = "log10", breaks = scales::log_breaks()) +
  scale_y_continuous(trans = "log10", breaks = scales::log_breaks()) +
  annotation_logticks() +
  theme_classic() +
  geom_smooth(method = "lm")

model <- lm(log(Birth) ~ log(Storks), data = storks)
model

summary(model)

# explain correlation
# explain range of correlation coefficient
cor(storks$Storks, storks$Birth)

cor(log(storks$Storks), log(storks$Birth))

cor.test(storks$Storks, storks$Birth)

cor(storks$Storks, storks$Birth, method = "spearman")

model <- lm(storks$Birth ~ storks$Storks)
summary(model)

cor(storks$Storks, storks$Birth)^2

with(storks, {
  plot(Storks, Birth)
  abline(lm(Birth~Storks))
})

with(storks, {
  plot(log(Storks), log(Birth))
  abline(lm(log(Birth)~log(Storks)))
})

# show xkcd
# talk about confounding factors / variables
storks

5.3 Tricks für viele Daten / Dateien

paths <- dir("data", full.names = TRUE, pattern = ".csv")
map(paths, read_csv)
# map_dfr

5.4 Non-Linear Regression

Puromycin <- as_tibble(Puromycin)
Puromycin

Puromycin %>% count(state)

puro_treat <- Puromycin %>% 
  filter(state == "treated")

plot(puro_treat$conc, puro_treat$rate)

ggplot(puro_treat, aes(conc, rate)) + geom_point()

michaelis_menten_fun <- function(conc, Vm, K) {
  (Vm * conc) / (K + conc)
}

x = seq(0, 1, by = 0.001)
y = michaelis_menten_fun(x, Vm = 200, K = 0.2)
plot(x, y)

plot(puro_treat$conc, puro_treat$rate)
curve(michaelis_menten_fun(conc = x, Vm = 200, K = 0.1),
      add = TRUE,
      col = "red",
      from = 0, to = 1,
      n = 100)

model <- nls(rate ~ michaelis_menten_fun(conc, Vm, K),
             start = list(Vm = 200, K = 0.1),
             data  = puro_treat)
model

coef(model)

summary(model)$coefficients

Vm_est <- coef(model)[1]
K_est  <- coef(model)[2]

plot(puro_treat$conc, puro_treat$rate)
curve(michaelis_menten_fun(x, Vm = Vm_est, K = K_est),
      add = TRUE,
      col = "red")

ggplot(puro_treat, aes(conc, rate)) +
  geom_point() +
  stat_function(fun = ~ michaelis_menten_fun(conc = .x, Vm = Vm_est, K = K_est),
                col = "red") +
  theme_classic()

model2 <- nls(rate ~ SSmicmen(conc, Vm, K),
             data  = puro_treat)
model2

predict(model2, newdata = list(conc = 1:10))

nested_data <- Puromycin %>% 
  group_nest(state)
nested_data

nested_data$data[[1]]

all_models <- nested_data %>% 
  mutate(model = map(data, ~ nls(rate ~ michaelis_menten_fun(conc, Vm, K),
                                start = list(Vm = 200, K = 0.1),
                                data = .x))
  )

all_models

fit_my_model <- function(df) {
  nls(rate ~ michaelis_menten_fun(conc, Vm, K),
                                start = list(Vm = 200, K = 0.1),
                                data = df)
}

fit_my_model(puro_treat)

all_models <- nested_data %>% 
  mutate(model = map(data, fit_my_model)
  )

all_models

all_models$model[[1]]

all_models <- all_models %>% 
  mutate(fitted = map(model, augment),
         coef   = map(model, coef))

all_models

all_models %>% 
  unnest_wider(coef)

all_models %>% 
  select(state, fitted) %>% 
  unnest(fitted)

all_models %>% 
  select(state, fitted) %>% 
  unnest(fitted) %>% 
  ggplot(aes(conc, rate, color = state)) +
  geom_point() +
  geom_line(aes(y = .fitted))

make_data <- function(model, from, to, n) {
  conc    = seq(from, to, length.out = n)
  .fitted = predict(model, newdata = list(conc = conc))
  tibble(conc, .fitted)
}

new_data <- all_models %>% 
  mutate(new_data = map(model, make_data, 0, 1, 100)) %>% 
  select(state, new_data) %>% 
  unnest(new_data)

all_models %>% 
  select(state, fitted) %>% 
  unnest(fitted) %>% 
  ggplot(aes(conc, rate, color = state)) +
  geom_point() +
  geom_line(data = new_data, aes(y = .fitted))

5.5 Übung

5.5.1 Mit den Datasaurus Dozen Datensets

datasauRus::datasaurus_dozen

Visualisiere alle Sets gemeinsam in einem ggplot Scatterplot, nutze dazu facet_wrap.
Füge mittels geom_smooth lineare Trendlinien hinzu
Fitte eine Lineare Regression and jedes der Datensets. Nutze dazu die Techniken aus R4DataScience: Many Models und das broom package.
Analysiere die Fits.