# R code for module activity
## 1- Install and load the main packages
#### Install packages
If any of the below packages is not installed on your computer, please install it. Remember to delete the **#** symbol before running the code.
```{r, eval=FALSE}
#install.packages("tidyverse")
#install.packages("ggplot2")
#install.packages("dplyr")
#install.packages("ggrepel")
#install.packages("patchwork")
#install.packages("gridExtra")
```
#### Load packages
```{r, eval=FALSE}
library(tidyverse)
library(ggplot2)
library(dplyr)
library(ggrepel)
library(patchwork)
library(gridExtra)
library(haven)
```
## 2- Load the dataset and make a copy
```{r, eval=FALSE}
publicmicrodatateachingsample <- read_sav("data/publicmicrodatateachingsample.sav")
census2021teaching <- publicmicrodatateachingsample
```
Important: Remember to change to pathway to your dataset accordingly.
If the **read_sav** code does not work, you can use the **Import Dataset** button in the Environment pane to load the dataset in R.
## 3- Drop unnecessary variables
```{r, eval=FALSE}
census2021teaching <- census2021teaching[,c("health_in_general",
"hours_per_week_worked",
"resident_age_7d","sex",
"ethnic_group_tb_6a",
"approx_social_grade")]
```
## 4- Exploratory analysis
### 4.1- Univariate analysis
#### 4.1.1 Univariate analysis for health in general and hours worked
##### Frequencies for health in general and hours worked
```{r, eval=FALSE}
table(census2021teaching$health_in_general)
table(census2021teaching$hours_per_week_worked)
```
##### Percentages for health in general and hours worked
```{r, eval=FALSE}
prop.table(table(census2021teaching$health_in_general)) * 100
prop.table(table(census2021teaching$hours_per_week_worked)) * 100
```
You can also calculate these frequencies and percentages with appropriate labels.
##### Frequencies and percentages with labels for health in general and hours worked
```{r, eval=FALSE}
census2021teaching %>%
mutate(
health_code = as.integer(as.character(health_in_general)),
health_label = factor(
health_code,
levels = c(1, 2, 3, 4, 5, -8),
labels = c("Very good", "Good", "Fair", "Bad", "Very bad", "Does not apply")
)
) %>%
count(health_label) %>%
mutate(percent = round(100 * n / sum(n), 1))
census2021teaching %>%
mutate(
hours_code = as.integer(as.character(hours_per_week_worked)),
hours_label = factor(
hours_code,
levels = c(1, 2, 3, 4, -8),
labels = c("[0 – 15]", "[16 – 30]", "[31 – 48]", "[49 and +]", "Does not apply")
)
) %>%
count(hours_label) %>%
mutate(percent = round(100 * n / sum(n), 1))
```
You can also generate pie charts for better visualization
### a) Pie chart for Health in general
##### a.1) Create a labelled health variable
```{r, eval=FALSE}
health <- census2021teaching %>%
mutate(
health_code = as.integer(as.character(health_in_general)),
health_label = factor(
health_code,
levels = c(1,2,3,4,5,-8),
labels = c("Very good", "Good", "Fair", "Bad", "Very bad", "Not applicable")
)
) %>%
filter(!is.na(health_label)) %>%
count(health_label, name = "n") %>%
mutate(
pct = round(100 * n / sum(n), 1),
label = paste0(health_label, " (", pct, "%)")
)
```
##### a.2) Plot pie chart
```{r, eval=FALSE}
p1 <- ggplot(health, aes(x = "", y = n, fill = health_label)) +
geom_col(width = 1, color = "white") +
coord_polar(theta = "y") +
geom_label_repel(aes(label = label),
position = position_stack(vjust = 0.5),
show.legend = FALSE,
size = 3) +
labs(title = "Distribution of general health", fill = "Health status") +
theme_void()+
scale_fill_brewer(palette = "Set2") +
theme(plot.title = element_text(size = 16, hjust = 1))
p1
```
### b) Pie chart for Hours worked
##### b.1) Create a labelled hours worked variable
```{r, eval=FALSE}
hours <- census2021teaching %>%
mutate(
hours_code = as.integer(as.character(hours_per_week_worked)),
hours_label = factor(
hours_code,
levels = c(1, 2, 3, 4, -8),
labels = c("[0 – 15]", "[16 – 30]", "[31 – 48]", "[49 and +]", "Not applicable")
)
) %>%
filter(!is.na(hours_label)) %>%
count(hours_label, name = "n") %>%
mutate(
pct = round(100 * n / sum(n), 1),
label = paste0(hours_label, " (", pct, "%)")
)
```
##### b.2) Plot pie chart
```{r, eval=FALSE}
p2 <- ggplot(hours, aes(x = "", y = n, fill = hours_label)) +
geom_col(width = 1, color = "white") +
coord_polar(theta = "y") +
geom_label_repel(aes(label = label),
position = position_stack(vjust = 0.5),
show.legend = FALSE,
size = 3) +
labs(title = "Distribution of worked hours", fill = "Hours worked") +
theme_void()+
scale_fill_brewer(palette = "Dark2") +
theme(plot.title = element_text(size = 16, hjust = 1))
p2
```
### 4.1.1.1 Transformation of variable Health in general
Create a new variable called **health_binary** by regrouping health in general in two broader categories: **Good or very good health** and **Poor health**
```{r, eval=FALSE}
census2021teaching$health_binary <- ifelse(
census2021teaching$health_in_general %in% c(1, 2), "Good or very good health",
ifelse(census2021teaching$health_in_general %in% c(3, 4, 5), "Poor health", NA)
)
```
##### Drop rows where health_in_general = -8 (Does not apply)
```{r, eval=FALSE}
census2021teaching <- subset(census2021teaching, health_in_general != -8)
```
##### Check the new variable's distribution
```{r, eval=FALSE}
table(census2021teaching$health_binary)
```
##### Create variable **Poor_health** and check its distribution
Code a binary variable that takes 1 for poor health and 0 otherwise. Call this new variable **Poor_health**. This variable will be the explained variable of our regression model.
```{r, eval=FALSE}
census2021teaching$Poor_health <- ifelse(
census2021teaching$health_binary == "Poor health", 1,
ifelse(census2021teaching$health_binary == "Good or very good health", 0, NA)
)
table(census2021teaching$Poor_health)
```
### 4.1.2 Univariate analysis for age, sex, ethnicity, social category
#### Frequencies and percentages for age, sex, ethnicity, social category
```{r, eval=FALSE}
table(census2021teaching$resident_age_7d)
table(census2021teaching$sex)
table(census2021teaching$ethnic_group_tb_6a)
table(census2021teaching$approx_social_grade)
prop.table(table(census2021teaching$resident_age_7d)) * 100
prop.table(table(census2021teaching$sex)) * 100
prop.table(table(census2021teaching$ethnic_group_tb_6a)) * 100
prop.table(table(census2021teaching$approx_social_grade)) * 100
```
#### Plot pie chart Age variable
##### a) Create a labelled Age variable
```{r, eval=FALSE}
Age <- census2021teaching %>%
mutate(
Age_code = as.integer(as.character(resident_age_7d)),
Age_label = factor(
Age_code,
levels = c(1, 2, 3, 4,5,6,7, -8),
labels = c("]0 – 15]", "[16 – 24]", "[25 – 34]", "[35 - 44]", "[45 - 54]", "[55 - 64]", "[65 and +]","Not applicable")
)
) %>%
filter(!is.na(Age_label)) %>%
count(Age_label, name = "n") %>%
mutate(
pct = round(100 * n / sum(n), 1),
label = paste0(Age_label, " (", pct, "%)")
)
```
##### b) Plot pie chart
```{r, eval=FALSE}
ggplot(Age, aes(x = "", y = n, fill = Age_label)) +
geom_col(width = 1, color = "white") +
coord_polar(theta = "y") +
geom_label_repel(aes(label = label),
position = position_stack(vjust = 0.5),
show.legend = FALSE,
size = 3) +
labs(title = "Distribution of respondents by Age", fill = "Resident age") +
theme_void()+
scale_fill_brewer(palette = "Pastel1") +
theme(plot.title = element_text(size = 16, hjust = 1))
```
#### Histogram for Sex variable
##### a) Recode sex variable with labels
```{r, eval=FALSE}
sex <- census2021teaching %>%
mutate(sex_label = factor(sex, levels = c(1, 2), labels = c("Male", "Female"))) %>%
count(sex_label, name = "n") %>%
mutate(prop = n / sum(n),
label = scales::percent(prop, accuracy = 0.1))
```
##### b) Plot histogram
```{r, eval=FALSE}
ggplot(sex, aes(x = sex_label, y = prop)) +
geom_col(fill = "skyblue", color = "black", width = 0.3) +
geom_text(aes(label = label), vjust = -0.5) +
scale_y_continuous(labels = scales::percent_format()) +
scale_x_discrete(expand = expansion(mult = c(0.5, 0.7))) +
labs(title = "Distribution of respondents by sex", x = "", y = "Percentage") +
theme_classic()+
theme(axis.text.x = element_text(size = 12))+
theme(plot.title = element_text(size = 16, hjust = 0.5))
```
#### Histogram for Ethnicity variable
##### a) Recode Ethnicity variable with labels
```{r, eval=FALSE}
Ethnicity <- census2021teaching %>%
mutate(Ethnicity_label = factor(ethnic_group_tb_6a, levels = c(1, 2, 3, 4, 5, -8),
labels = c("Asian", "Black", "Mixed", "White", "Other", "Does not apply"))) %>%
count(Ethnicity_label, name = "n") %>%
mutate(prop = n / sum(n),
label = scales::percent(prop, accuracy = 0.1))
```
##### b) Plot histogram
```{r, eval=FALSE}
ggplot(Ethnicity, aes(x = Ethnicity_label, y = prop)) +
geom_col(fill = "chocolate", color = "black", width = 0.3) +
geom_text(aes(label = label), vjust = -0.5) +
scale_y_continuous(labels = scales::percent_format()) +
labs(title = "Distribution of respondents by Ethnicity", x = "", y = "Percentage") +
theme_classic()+
theme(axis.text.x = element_text(size = 12))+
theme(plot.title = element_text(size = 16, hjust = 0.5))
```
#### Pie chart for variable social class
##### a) Create a labelled social class variable
```{r, eval=FALSE}
Social_class <- census2021teaching %>%
mutate(
Social_class_code = as.integer(as.character(approx_social_grade)),
Social_class_label = factor(
Social_class_code,
levels = c(1, 2, 3, 4, -8),
labels = c("Higher, intermediate managers", "Supervisory, junior managers",
"Skilled manual", "Semi-skilled, unskilled manual","Not applicable")
)
) %>%
filter(!is.na(Social_class_label)) %>%
count(Social_class_label, name = "n") %>%
mutate(
pct = round(100 * n / sum(n), 1),
label = paste0(Social_class_label, " (", pct, "%)")
)
```
##### b) Plot pie chart
```{r, eval=FALSE}
ggplot(Social_class, aes(x = "", y = n, fill = Social_class_label)) +
geom_col(width = 1, color = "white") +
coord_polar(theta = "y") +
geom_label_repel(aes(label = label),
position = position_stack(vjust = 0.5),
show.legend = FALSE,
size = 3) +
labs(title = "Distribution of respondents by Social class", fill = "Social class") +
theme_void()+
scale_fill_brewer(palette = "Pastel2") +
theme(plot.title = element_text(size = 16, hjust = 1))
```
##### Drop under 16 from the table
```{r, eval=FALSE}
census2021teaching <- subset(census2021teaching, resident_age_7d != 1)
```
### 4.2- Bivariate analysis
##### Cross tabulation of Poor_health by hours worked
```{r, eval=FALSE}
tabulation <- table(census2021teaching$Poor_health, census2021teaching$hours_per_week_worked)
tabulation
```
##### Row percentages
```{r, eval=FALSE}
prop.table(tabulation, margin = 1) * 100
```
##### Column percentages
```{r, eval=FALSE}
prop.table(tabulation, margin = 2) * 100
```
#### Chi square test
##### a) Creating cross tabulations to be tested
```{r, eval=FALSE}
tabulation.health_hours <- table(census2021teaching$Poor_health, census2021teaching$hours_per_week_worked)
tabulation.health_age <- table(census2021teaching$Poor_health, census2021teaching$resident_age_7d)
tabulation.health_sex <- table(census2021teaching$Poor_health, census2021teaching$sex)
tabulation.health_ethnicity <- table(census2021teaching$Poor_health, census2021teaching$ethnic_group_tb_6a)
tabulation.health_classes <- table(census2021teaching$Poor_health, census2021teaching$approx_social_grade)
```
##### b) chi square test on health and our 5 other variables
```{r, eval=FALSE}
chisq.test(tabulation.health_hours)
chisq.test(tabulation.health_age)
chisq.test(tabulation.health_sex)
chisq.test(tabulation.health_ethnicity)
chisq.test(tabulation.health_classes)
```
## 5 Regression model
### 5.1 Recode variables as factors with labels
```{r, eval=FALSE}
census2021teaching$hours_per_week_worked <- factor(
census2021teaching$hours_per_week_worked,
levels = c(1, 2, 3, 4, -8),
labels = c("0-15", "16-30", "31-48", "49+", "Does not apply")
)
census2021teaching$resident_age_7d <- factor(
census2021teaching$resident_age_7d,
levels = c(1, 2, 3, 4, 5, 6, 7, -8),
labels = c("0-15", "16-24", "25-34", "35-44", "45-54", "55-64", "65+", "Not applicable")
)
census2021teaching$ethnic_group_tb_6a <- factor(
census2021teaching$ethnic_group_tb_6a,
levels = c(1, 2, 3, 4, 5, -8),
labels = c("Asian", "Black", "Mixed", "White", "Other", "Does not apply")
)
census2021teaching$approx_social_grade <- factor(
census2021teaching$approx_social_grade,
levels = c(1, 2, 3, 4, -8),
labels = c("Higher, intermediate managers",
"Supervisory, junior managers",
"Skilled manual",
"Semi-skilled, unskilled manual",
"Not applicable")
)
census2021teaching$sex <- factor(
census2021teaching$sex,
levels = c(1, 2),
labels = c("Male", "Female")
)
```
### 5.2 Pick references that are central or policy relevant
```{r, eval=FALSE}
census2021teaching$hours_per_week_worked <- relevel(census2021teaching$hours_per_week_worked, ref = "31-48")
census2021teaching$resident_age_7d <- relevel(census2021teaching$resident_age_7d, ref = "45-54")
census2021teaching$sex <- relevel(census2021teaching$sex, ref = "Male")
census2021teaching$ethnic_group_tb_6a <- relevel(census2021teaching$ethnic_group_tb_6a, ref = "White")
census2021teaching$approx_social_grade <- relevel(census2021teaching$approx_social_grade, ref = "Higher, intermediate managers")
```
### 5.3 Fit logistic regression
```{r, eval=FALSE}
model_health <- glm(
Poor_health ~ hours_per_week_worked + resident_age_7d + sex + ethnic_group_tb_6a + approx_social_grade,
data = census2021teaching,
family = binomial(link = "logit")
)
```
#### Obtain summary results of the logistic regression
```{r, eval=FALSE}
summary(model_health)
```