-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscratch.R
More file actions
122 lines (82 loc) · 3.41 KB
/
scratch.R
File metadata and controls
122 lines (82 loc) · 3.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
plant_data <- read.csv(file = "two_discrete_x_values.csv");
species_1 <- plant_data$height[plant_data$species_ID == "species_1"];
species_2 <- plant_data$height[plant_data$species_ID == "species_2"];
hist(species_1, breaks = 10, col = "blue", xlim = c(0, 300),
main = "", xlab = "Plant height", cex.lab = 1.25,
ylim = c(0, 15));
hist(species_2, breaks = 10, col = "red", add = TRUE)
#legend(x = 0, y = 15, fill = c("red", "blue"),
# legend = c("species_2", "species_1"));
n1 <- length(species_1);
n2 <- length(species_2);
var_y1 <- var(species_1);
var_y2 <- var(species_2);
s_p <- sqrt(((n1 - 1)*var_y1 + (n2 - 1)*var_y2)/(n1 + n2 - 2));
SEybar <- s_p * sqrt((n1 + n2) / (n1*n2));
tval <- (mean(species_1) - mean(species_2)) / SEybar;
#################
################# Using the t-test function in R and lm
#################
t.test(species_1, species_2, var.equal = TRUE);
# Use a linear model instead to get the same numbers
lmod1 <- lm(height ~ 1 + species_ID, data = plant_data);
summary(lmod1);
#################
################# How R sees the linear model and t-test
#################
is_species_2 <- as.numeric(plant_data$species_ID == "species_2");
plant_height <- plant_data$height;
lm_table_eg <- data.frame(plant_height, is_species_2);
#################
################# Thinking about this in a plot
#################
plot(x = lm_table_eg$is_species_2, y = lm_table_eg$plant_height,
ylim = c(0, 275), pch = 20, cex.axis = 1.25, cex.lab = 1.25,
ylab = "Plant height", xlab = "Is species 2: Yes (1) or no (0)",
xlim = c(0, 1));
lines(x = c(0, 1), y = c(summary(lmod1)$coefficients[1,1],
summary(lmod1)$coefficients[1,1] +
summary(lmod1)$coefficients[2,1]),
col = "red", lwd = 4);
points(x = 0, y = summary(lmod1)$coefficients[1,1], pch = 17,
col = "blue", cex = 3);
points(x = 1, y = summary(lmod1)$coefficients[1,1] +
summary(lmod1)$coefficients[2,1], pch = 18, col = "orange", cex = 3);
#################
################# Analysis of variance with two categories
#################
aov_1 <- aov(height ~ species_ID, data = plant_data);
summary(aov_1);
#################
################# What happens when we have three categories? ANOVA
#################
plant_data <- read.csv(file = "three_discrete_x_values.csv");
aov_2 <- aov(height ~ species_ID, data = plant_data);
summary(aov_2);
#################
################# Three categories with a linear model
#################
lmod2 <- lm(height ~ 1 + species_ID, data = plant_data);
summary(lmod2);
# Can we check out the mean values?
tapply(X = plant_data$height, INDEX = plant_data$species_ID,
FUN = mean);
#################
################# What's really going on with 3 groups?
#################
the_intercept <- rep(x = 1, length = length(plant_data$height));
is_species_2 <- as.numeric(plant_data$species_ID == "species_2");
is_species_3 <- as.numeric(plant_data$species_ID == "species_3");
lm_table_eg <- data.frame(plant_data$height, the_intercept,
is_species_2, is_species_3);
head(lm_table_eg);
#################
################# Even more elegant, using matrices!
#################
Y <- as.matrix(lm_table_eg[,1]);
X <- as.matrix(lm_table_eg[,2:4]);
# Now solve for Beta in: Y = X * Beta
betas <- solve( t(X) %*% X ) %*% t(X) %*% Y;
print(betas);
# Compare back to this
summary(lmod2);