Roster Management and Name Cleaning

zoomstudentengagement package

2025-08-03

library(zoomstudentengagement)
library(dplyr)
library(ggplot2)

Roster Management and Name Cleaning

This vignette shows how to manage student rosters and clean name mismatches between Zoom transcripts and official enrollment data.

Loading Student Rosters

Basic Roster Loading

Load a student roster from a CSV file:

# Locate the example data bundled with the installed package
data_folder <- system.file("extdata", package = "zoomstudentengagement")

# Read the sample roster CSV from that folder
roster_df <- load_roster(data_folder = data_folder, roster_file = "roster.csv")
#> Rows: 7 Columns: 22
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (14): first_last, last_first, last_name, first_name, role, email_address...
#> dbl  (6): student_id, user_id, lec, units, course, section
#> lgl  (2): waitlist_position, enrolled
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Preview the first six roster rows together with their column types
head(roster_df, n = 6L)
#> # A tibble: 6 × 22
#>   student_id first_last          last_first   last_name first_name user_id role 
#>        <dbl> <chr>               <chr>        <chr>     <chr>        <dbl> <chr>
#> 1 9990000019 Melissa Ko          Ko, Melissa  Ko        Melissa    9990019 Stud…
#> 2 9990000020 Ryan Sloan          Sloan, Ryan  Sloan     Ryan       9990020 Stud…
#> 3 9990000021 Shreeharsh Kelkar   Kelkar, Shr… Kelkar    Shreeharsh 9990021 Stud…
#> 4 9990000022 Srijani Ghosh       Ghosh, Srij… Ghosh     Srijani    9990022 Stud…
#> 5 9990000023 Khalilah Beal-Uribe Beal-Uribe,… Beal-Uri… Khalilah   9990023 Stud…
#> 6 9990000024 John Fielding       Fielding, J… Fielding  John       9990024 Stud…
#> # ℹ 15 more variables: email_address <chr>, lec <dbl>, majors <chr>,
#> #   terms_in_attendance <chr>, units <dbl>, grading_basis <chr>,
#> #   waitlist_position <lgl>, dept <chr>, course <dbl>, section <dbl>,
#> #   roster_dt_min <chr>, roster_dt_max <chr>, enrolled <lgl>, roster_dt <chr>,
#> #   preferred_name <chr>

Creating Section Summaries

Generate summaries by course section:

# Summarise the roster by course section; the outer parentheses print the
# result as well as assigning it
(sections_df <- make_sections_df(roster_df))
#> # A tibble: 1 × 4
#>   dept  course section     n
#>   <chr> <chr>  <chr>   <int>
#> 1 LTF   23     24          7

Creating Student Lists

Create a simplified student list for analysis:

# Reduce the full roster to a simplified student list
roster_small_df <- make_roster_small(roster_df)

# Preview the first six students
head(roster_small_df, n = 6L)
#> # A tibble: 6 × 6
#>   student_id first_last          preferred_name      dept  course section
#>   <chr>      <chr>               <chr>               <chr> <chr>  <chr>  
#> 1 9990000019 Melissa Ko          Melissa Ko          LTF   23     24     
#> 2 9990000020 Ryan Sloan          Ryan Sloan          LTF   23     24     
#> 3 9990000021 Shreeharsh Kelkar   Shreeharsh Kelkar   LTF   23     24     
#> 4 9990000022 Srijani Ghosh       Srijani Ghosh       LTF   23     24     
#> 5 9990000023 Khalilah Beal-Uribe Khalilah Beal-Uribe LTF   23     24     
#> 6 9990000024 John Fielding       John Fielding       LTF   23     24

Working with Transcript Data

Loading Transcript Metrics

First, let’s create some sample transcript metrics to work with:

# Build a small, hand-made metrics table for demonstration purposes.
# With real data this table would come from summarize_transcript_metrics()
# applied to actual transcript files.
transcripts_metrics_df <- tibble::tibble(
  # Speaker names as they appear in the transcript
  name = c("Conor Healy", "Dr. Melissa Ko", "Ryan Sloan"),
  # Per-speaker metrics
  n = c(15, 8, 12),
  duration = c(45.2, 23.1, 38.7),
  wordcount = c(1200, 650, 950),
  comments = list(
    "Good discussion", "Excellent points", "Interesting question"
  ),
  n_perc = c(25, 13.3, 20),
  duration_perc = c(25, 13.3, 20),
  wordcount_perc = c(25, 13.3, 20),
  wpm = c(26.5, 28.1, 24.6),
  # Session details; length-one values are recycled to every row
  course_section = "23.24",
  course = 23,
  section = 24,
  day = "Thursday",
  time = "18:30",
  # Reuses the `name` column defined above
  name_raw = name,
  start_time_local = as.POSIXct(
    "2024-01-24 18:30:00",
    tz = "America/Los_Angeles"
  ),
  dept = "LTF",
  session_num = 1
)

# Preview the metrics (the table has only three rows, so all are shown)
head(transcripts_metrics_df, n = 6L)
#> # A tibble: 3 × 18
#>   name         n duration wordcount comments n_perc duration_perc wordcount_perc
#>   <chr>    <dbl>    <dbl>     <dbl> <list>    <dbl>         <dbl>          <dbl>
#> 1 Conor H…    15     45.2      1200 <chr>      25            25             25  
#> 2 Dr. Mel…     8     23.1       650 <chr>      13.3          13.3           13.3
#> 3 Ryan Sl…    12     38.7       950 <chr>      20            20             20  
#> # ℹ 10 more variables: wpm <dbl>, course_section <chr>, course <dbl>,
#> #   section <dbl>, day <chr>, time <chr>, name_raw <chr>,
#> #   start_time_local <dttm>, dept <chr>, session_num <dbl>

Creating Session Rosters

Create a roster with entries for each recorded session:

# Describe the single recorded session by hand for this demonstration.
# With real data, use the package's full data loading functions instead.
transcripts_list_df <- tibble::tibble(
  dept = "LTF",
  course = 23,
  section = 24,
  session_num = 1,
  start_time_local = as.POSIXct(
    "2024-01-24 18:30:00",
    tz = "America/Los_Angeles"
  ),
  course_section = "23.24"
)

# Combine the session table with the simplified roster
roster_sessions <-
  make_student_roster_sessions(transcripts_list_df, roster_small_df)

# Preview the first six roster-session rows
head(roster_sessions, n = 6L)
#> # A tibble: 6 × 9
#>   student_id first_last          preferred_name dept  course section session_num
#>   <chr>      <chr>               <chr>          <chr> <chr>  <chr>         <dbl>
#> 1 9990000019 Melissa Ko          Melissa Ko     LTF   23     24                1
#> 2 9990000020 Ryan Sloan          Ryan Sloan     LTF   23     24                1
#> 3 9990000021 Shreeharsh Kelkar   Shreeharsh Ke… LTF   23     24                1
#> 4 9990000022 Srijani Ghosh       Srijani Ghosh  LTF   23     24                1
#> 5 9990000023 Khalilah Beal-Uribe Khalilah Beal… LTF   23     24                1
#> 6 9990000024 John Fielding       John Fielding  LTF   23     24                1
#> # ℹ 2 more variables: start_time_local <dttm>, course_section <chr>

Name Cleaning Process

Initial Name Matching

Join transcript names with roster data:

# Sample data only: look for the names lookup file in the session's
# temporary directory
temp_dir <- tempdir()

# Join the transcript metrics with the roster sessions to build the
# clean names table
clean_names_df <- make_clean_names_df(
  transcripts_metrics_df,
  roster_sessions,
  data_folder = temp_dir,
  section_names_lookup_file = "section_names_lookup.csv"
)
#> Warning in load_section_names_lookup(data_folder = data_folder,
#> names_lookup_file = section_names_lookup_file, : File does not exist:
#> /var/folders/gm/wnk5gljx6yd_ffmqb8vf48qh0000gn/T//Rtmpux9gKP/section_names_lookup.csv
#> Warning in load_section_names_lookup(data_folder = data_folder,
#> names_lookup_file = section_names_lookup_file, : Creating empty lookup table.

# Preview the result of the initial matching
head(clean_names_df, n = 6L)
#> # A tibble: 3 × 22
#>   preferred_name formal_name   transcript_name student_id section course_section
#>   <chr>          <chr>         <chr>           <chr>      <chr>   <chr>         
#> 1 Conor Healy    Conor Healy   Conor Healy     9990000025 24      23.24         
#> 2 Dr. Melissa Ko Dr. Melissa … Dr. Melissa Ko  <NA>       24      23.24         
#> 3 Ryan Sloan     Ryan Sloan    Ryan Sloan      9990000020 24      23.24         
#> # ℹ 16 more variables: session_num <dbl>, n <dbl>, duration <dbl>,
#> #   wordcount <dbl>, comments <list>, n_perc <dbl>, duration_perc <dbl>,
#> #   wordcount_perc <dbl>, wpm <dbl>, name_raw <chr>, start_time_local <dttm>,
#> #   time <chr>, day <chr>, course <chr>, dept <chr>, first_last <chr>

Identifying Names to Clean

Find names that need manual cleaning:

# Collect the transcript names that still need manual attention; the
# outer parentheses print the result as well as assigning it
(names_to_clean <- make_names_to_clean_df(clean_names_df))
#> # A tibble: 1 × 4
#>   student_id preferred_name transcript_name     n
#>   <chr>      <chr>          <chr>           <dbl>
#> 1 <NA>       Dr. Melissa Ko Dr. Melissa Ko      1

Writing Names Lookup

Create a lookup file for manual editing:

# Write the section names lookup so it can be edited by hand.
# `data_folder` points at the installed package's example data, which may be
# read-only and should not be modified, so the file goes to the session's
# temporary directory instead. In your own project, write it to your data
# folder and pass that same folder to make_clean_names_df().
write_section_names_lookup(
  clean_names_df,
  data_folder = temp_dir,
  section_names_lookup_file = "section_names_lookup.csv"
)
#> # A tibble: 3 × 9
#>   course_section day      time  course section preferred_name formal_name   
#>   <chr>          <chr>    <chr> <chr>  <chr>   <chr>          <chr>         
#> 1 23.24          Thursday 18:30 23     24      Conor Healy    Conor Healy   
#> 2 23.24          Thursday 18:30 23     24      Dr. Melissa Ko Dr. Melissa Ko
#> 3 23.24          Thursday 18:30 23     24      Ryan Sloan     Ryan Sloan    
#> # ℹ 2 more variables: transcript_name <chr>, student_id <chr>

Manual Name Cleaning

Understanding the Process

The name cleaning process involves:

  1. Automatic Matching: The package attempts to match transcript names to roster names
  2. Manual Review: Names that don’t match automatically are flagged for review
  3. Lookup File: A CSV file is created for manual editing
  4. Iterative Process: Repeat until all names are properly matched

Common Name Issues

Typical issues you might encounter:

  • Nicknames: “Mike” vs “Michael”
  • Display Names: “Prof. Smith” vs “Dr. Smith”
  • Guest Speakers: Names not in the roster
  • Technical Issues: “Unknown” or “User123”

Editing the Lookup File

The lookup file contains:

  • transcript_name: Name as it appears in the transcript
  • preferred_name: How you want it to appear in analysis
  • formal_name: Official name from roster
  • student_id: Student identifier
  • section: Course section

Advanced Name Cleaning

Using Preferred Names

The package supports preferred names from the roster:

# Report whether the roster carries a preferred-name column and, when it
# does, show it next to the formal name
if (!("preferred_name" %in% colnames(roster_df))) {
  cat("Roster uses formal names only\n")
} else {
  cat("Roster includes preferred names\n")
  head(roster_df[, c("first_last", "preferred_name")], n = 6L)
}
#> Roster includes preferred names
#> # A tibble: 6 × 2
#>   first_last          preferred_name     
#>   <chr>               <chr>              
#> 1 Melissa Ko          Melissa Ko         
#> 2 Ryan Sloan          Ryan Sloan         
#> 3 Shreeharsh Kelkar   Shreeharsh Kelkar  
#> 4 Srijani Ghosh       Srijani Ghosh      
#> 5 Khalilah Beal-Uribe Khalilah Beal-Uribe
#> 6 John Fielding       John Fielding

Handling Special Cases

For special cases like guest speakers:

# Example: Mark guest speakers
# In the lookup file, set:
# transcript_name: "Guest Speaker"
# preferred_name: "Guest"
# formal_name: "Guest Speaker"
# student_id: "GUEST001"
# section: "24"  (same format as the section column in the lookup file)

Iterative Cleaning Process

Step-by-Step Workflow

  1. Run initial matching:

    clean_names_df <- make_clean_names_df(...)

  2. Check for unmatched names:

    names_to_clean <- make_names_to_clean_df(clean_names_df)

  3. Edit the lookup file manually

  4. Re-run matching with updated lookup

  5. Repeat until all names are matched

Validation

Check your cleaning results:

# Re-run the matching once the lookup file has been edited.
# The lookup is read from the session's temporary directory (`temp_dir`) so
# that the installed package's example data folder is only ever read, never
# used as a working location. In your own project, point `data_folder` at
# the folder that holds your edited lookup file.
final_clean_names <- make_clean_names_df(
  data_folder = temp_dir,
  section_names_lookup_file = "section_names_lookup.csv",
  transcripts_metrics_df,
  roster_sessions
)

# Count the names that are still unmatched and report the outcome
remaining_unmatched <- make_names_to_clean_df(final_clean_names)
if (nrow(remaining_unmatched) == 0) {
  cat("All names successfully matched!\n")
} else {
  cat("Still have", nrow(remaining_unmatched), "names to clean\n")
}
#> Still have 3 names to clean

Next Steps