library(zoomstudentengagement)
library(dplyr)
library(ggplot2)
This vignette shows how to manage student rosters and clean name mismatches between Zoom transcripts and official enrollment data.
Load a student roster from a CSV file:
# Load the sample roster
# system.file() resolves the extdata directory of the installed package,
# which is where the bundled sample roster lives.
data_folder <- system.file("extdata", package = "zoomstudentengagement")

roster_df <- load_roster(
  data_folder = data_folder,
  roster_file = "roster.csv"
)
#> Rows: 7 Columns: 22
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (14): first_last, last_first, last_name, first_name, role, email_address...
#> dbl (6): student_id, user_id, lec, units, course, section
#> lgl (2): waitlist_position, enrolled
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View the roster structure
head(roster_df)
#> # A tibble: 6 × 22
#> student_id first_last last_first last_name first_name user_id role
#> <dbl> <chr> <chr> <chr> <chr> <dbl> <chr>
#> 1 9990000019 Melissa Ko Ko, Melissa Ko Melissa 9990019 Stud…
#> 2 9990000020 Ryan Sloan Sloan, Ryan Sloan Ryan 9990020 Stud…
#> 3 9990000021 Shreeharsh Kelkar Kelkar, Shr… Kelkar Shreeharsh 9990021 Stud…
#> 4 9990000022 Srijani Ghosh Ghosh, Srij… Ghosh Srijani 9990022 Stud…
#> 5 9990000023 Khalilah Beal-Uribe Beal-Uribe,… Beal-Uri… Khalilah 9990023 Stud…
#> 6 9990000024 John Fielding Fielding, J… Fielding John 9990024 Stud…
#> # ℹ 15 more variables: email_address <chr>, lec <dbl>, majors <chr>,
#> # terms_in_attendance <chr>, units <dbl>, grading_basis <chr>,
#> # waitlist_position <lgl>, dept <chr>, course <dbl>, section <dbl>,
#> # roster_dt_min <chr>, roster_dt_max <chr>, enrolled <lgl>, roster_dt <chr>,
#> # preferred_name <chr>
Generate summaries by course section:
# Create sections summary (one row per dept/course/section with a count)
sections_df <- make_sections_df(roster_df)

# View sections
sections_df
#> # A tibble: 1 × 4
#> dept course section n
#> <chr> <chr> <chr> <int>
#> 1 LTF 23 24 7
Create a simplified student list for analysis:
# Create simplified roster (keeps only the identifying and section columns)
roster_small_df <- make_roster_small(roster_df)

# View simplified roster
head(roster_small_df)
#> # A tibble: 6 × 6
#> student_id first_last preferred_name dept course section
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 9990000019 Melissa Ko Melissa Ko LTF 23 24
#> 2 9990000020 Ryan Sloan Ryan Sloan LTF 23 24
#> 3 9990000021 Shreeharsh Kelkar Shreeharsh Kelkar LTF 23 24
#> 4 9990000022 Srijani Ghosh Srijani Ghosh LTF 23 24
#> 5 9990000023 Khalilah Beal-Uribe Khalilah Beal-Uribe LTF 23 24
#> 6 9990000024 John Fielding John Fielding LTF 23 24
First, let’s create some sample transcript metrics to work with:
# Create sample transcript metrics for demonstration
# In practice, you would use summarize_transcript_metrics() on actual transcript files
transcripts_metrics_df <- tibble::tibble(
  name = c("Conor Healy", "Dr. Melissa Ko", "Ryan Sloan"),
  n = c(15, 8, 12),
  duration = c(45.2, 23.1, 38.7),
  wordcount = c(1200, 650, 950),
  comments = list("Good discussion", "Excellent points", "Interesting question"),
  n_perc = c(25.0, 13.3, 20.0),
  duration_perc = c(25.0, 13.3, 20.0),
  wordcount_perc = c(25.0, 13.3, 20.0),
  wpm = c(26.5, 28.1, 24.6),
  # Scalar values below are recycled to all three rows.
  course_section = "23.24",
  course = 23,
  section = 24,
  day = "Thursday",
  time = "18:30",
  # tibble() evaluates its arguments in order, so `name` here refers to the
  # column defined at the top of this call.
  name_raw = name,
  start_time_local = as.POSIXct(
    "2024-01-24 18:30:00",
    tz = "America/Los_Angeles"
  ),
  dept = "LTF",
  session_num = 1
)

# View transcript metrics
head(transcripts_metrics_df)
#> # A tibble: 3 × 18
#> name n duration wordcount comments n_perc duration_perc wordcount_perc
#> <chr> <dbl> <dbl> <dbl> <list> <dbl> <dbl> <dbl>
#> 1 Conor H… 15 45.2 1200 <chr> 25 25 25
#> 2 Dr. Mel… 8 23.1 650 <chr> 13.3 13.3 13.3
#> 3 Ryan Sl… 12 38.7 950 <chr> 20 20 20
#> # ℹ 10 more variables: wpm <dbl>, course_section <chr>, course <dbl>,
#> # section <dbl>, day <chr>, time <chr>, name_raw <chr>,
#> # start_time_local <dttm>, dept <chr>, session_num <dbl>
Create a roster with entries for each recorded session:
# Create a simplified session mapping for demonstration
# In practice, you would use the full data loading functions
# A single row describes the one recorded session used in this example.
transcripts_list_df <- tibble::tibble(
  dept = "LTF",
  course = 23,
  section = 24,
  session_num = 1,
  start_time_local = as.POSIXct(
    "2024-01-24 18:30:00",
    tz = "America/Los_Angeles"
  ),
  course_section = "23.24"
)
# Create roster sessions: combine the session list with the simplified roster
# so that each student has an entry for each recorded session.
roster_sessions <- make_student_roster_sessions(
  transcripts_list_df,
  roster_small_df
)

# View roster sessions
head(roster_sessions)
#> # A tibble: 6 × 9
#> student_id first_last preferred_name dept course section session_num
#> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
#> 1 9990000019 Melissa Ko Melissa Ko LTF 23 24 1
#> 2 9990000020 Ryan Sloan Ryan Sloan LTF 23 24 1
#> 3 9990000021 Shreeharsh Kelkar Shreeharsh Ke… LTF 23 24 1
#> 4 9990000022 Srijani Ghosh Srijani Ghosh LTF 23 24 1
#> 5 9990000023 Khalilah Beal-Uribe Khalilah Beal… LTF 23 24 1
#> 6 9990000024 John Fielding John Fielding LTF 23 24 1
#> # ℹ 2 more variables: start_time_local <dttm>, course_section <chr>
Join transcript names with roster data:
# Create clean names dataframe
# Use a temporary directory since we're working with sample data
temp_dir <- tempdir()

# No lookup file exists in the temporary directory yet, so this call warns and
# falls back to an empty lookup table (see the warnings printed below).
clean_names_df <- make_clean_names_df(
  data_folder = temp_dir,
  section_names_lookup_file = "section_names_lookup.csv",
  transcripts_metrics_df,
  roster_sessions
)
#> Warning in load_section_names_lookup(data_folder = data_folder,
#> names_lookup_file = section_names_lookup_file, : File does not exist:
#> /var/folders/gm/wnk5gljx6yd_ffmqb8vf48qh0000gn/T//Rtmpux9gKP/section_names_lookup.csv
#> Warning in load_section_names_lookup(data_folder = data_folder,
#> names_lookup_file = section_names_lookup_file, : Creating empty lookup table.
# View initial matching results
head(clean_names_df)
#> # A tibble: 3 × 22
#> preferred_name formal_name transcript_name student_id section course_section
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Conor Healy Conor Healy Conor Healy 9990000025 24 23.24
#> 2 Dr. Melissa Ko Dr. Melissa … Dr. Melissa Ko <NA> 24 23.24
#> 3 Ryan Sloan Ryan Sloan Ryan Sloan 9990000020 24 23.24
#> # ℹ 16 more variables: session_num <dbl>, n <dbl>, duration <dbl>,
#> # wordcount <dbl>, comments <list>, n_perc <dbl>, duration_perc <dbl>,
#> # wordcount_perc <dbl>, wpm <dbl>, name_raw <chr>, start_time_local <dttm>,
#> # time <chr>, day <chr>, course <chr>, dept <chr>, first_last <chr>
Find names that need manual cleaning:
# Find names that need cleaning (transcript names with no matching student)
names_to_clean <- make_names_to_clean_df(clean_names_df)

# View names needing attention
names_to_clean
#> # A tibble: 1 × 4
#> student_id preferred_name transcript_name n
#> <chr> <chr> <chr> <dbl>
#> 1 <NA> Dr. Melissa Ko Dr. Melissa Ko 1
Create a lookup file for manual editing:
# Write section names lookup
# NOTE(review): data_folder was set from system.file(), i.e. the installed
# package's extdata directory. For real data, point this at a writable
# project folder instead — confirm the intended destination.
write_section_names_lookup(
  clean_names_df,
  data_folder = data_folder,
  section_names_lookup_file = "section_names_lookup.csv"
)
#> # A tibble: 3 × 9
#> course_section day time course section preferred_name formal_name
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 23.24 Thursday 18:30 23 24 Conor Healy Conor Healy
#> 2 23.24 Thursday 18:30 23 24 Dr. Melissa Ko Dr. Melissa Ko
#> 3 23.24 Thursday 18:30 23 24 Ryan Sloan Ryan Sloan
#> # ℹ 2 more variables: transcript_name <chr>, student_id <chr>
The name cleaning process involves:
Typical issues you might encounter:
The lookup file contains:
The package supports preferred names from the roster:
# Check if roster has preferred names
if ("preferred_name" %in% colnames(roster_df)) {
  cat("Roster includes preferred names\n")
  # Show formal and preferred names side by side for comparison.
  head(roster_df[, c("first_last", "preferred_name")])
} else {
  cat("Roster uses formal names only\n")
}
#> Roster includes preferred names
#> # A tibble: 6 × 2
#> first_last preferred_name
#> <chr> <chr>
#> 1 Melissa Ko Melissa Ko
#> 2 Ryan Sloan Ryan Sloan
#> 3 Shreeharsh Kelkar Shreeharsh Kelkar
#> 4 Srijani Ghosh Srijani Ghosh
#> 5 Khalilah Beal-Uribe Khalilah Beal-Uribe
#> 6 John Fielding John Fielding
For special cases like guest speakers:
# Example: Mark guest speakers
# In the lookup file, set:
# transcript_name: "Guest Speaker"
# preferred_name: "Guest"
# formal_name: "Guest Speaker"
# student_id: "GUEST001"
# section: "LTF.201.1"
Run initial matching:
# `...` is a placeholder for the matching arguments; it is not literal code.
clean_names_df <- make_clean_names_df(...)
Check for unmatched names:
# Any rows returned here are names that still need a lookup entry.
names_to_clean <- make_names_to_clean_df(clean_names_df)
Edit the lookup file manually
Re-run matching with updated lookup
Repeat until all names are matched
Check your cleaning results:
# After cleaning, check results
# Re-run the matching against data_folder, where the lookup file was written.
final_clean_names <- make_clean_names_df(
  data_folder = data_folder,
  section_names_lookup_file = "section_names_lookup.csv",
  transcripts_metrics_df,
  roster_sessions
)

# Check for any remaining unmatched names
remaining_unmatched <- make_names_to_clean_df(final_clean_names)

if (nrow(remaining_unmatched) == 0) {
  cat("All names successfully matched!\n")
} else {
  cat("Still have", nrow(remaining_unmatched), "names to clean\n")
}
#> Still have 3 names to clean