Transcript Processing with zoomstudentengagement

zoomstudentengagement package

2025-08-03

library(zoomstudentengagement)
library(dplyr)
library(ggplot2)

Transcript Processing

This vignette shows how to load, process, and analyze Zoom transcripts using the zoomstudentengagement package.

Loading Individual Transcripts

Basic Transcript Loading

Start by loading a raw Zoom transcript:

# Locate the sample transcript shipped in the package's extdata folder.
# NOTE(review): system.file() returns "" when the file is not found —
# this assumes the package was installed with its extdata intact.
transcript_file <- system.file(
  "extdata/transcripts/GMT20240124-202901_Recording.transcript.vtt",
  package = "zoomstudentengagement"
)

# Parse the raw .vtt file into a tibble: one row per caption entry with
# speaker name, comment text, start/end times, duration, and wordcount.
raw_transcript <- load_zoom_transcript(transcript_file_path = transcript_file)

# View the structure of the first few rows
head(raw_transcript)
#> # A tibble: 6 × 8
#>   transcript_file        comment_num name  comment start     end        duration
#>   <chr>                  <chr>       <chr> <chr>   <time>    <time>     <drtn>  
#> 1 GMT20240124-202901_Re… 1           Srij… Hi!     00'00.05" 00'01.790"  1.740 …
#> 2 GMT20240124-202901_Re… 2           Cono… Get th… 00'02.07" 00'04.050"  1.980 …
#> 3 GMT20240124-202901_Re… 3           Srij… Hello!… 00'05.14" 00'08.310"  3.170 …
#> 4 GMT20240124-202901_Re… 4           Srij… So      00'09.71" 00'11.670"  1.960 …
#> 5 GMT20240124-202901_Re… 5           Cono… let's … 00'12.16" 00'14.309"  2.149 …
#> 6 GMT20240124-202901_Re… 6           Cono… Studen… 00'14.55" 00'30.599" 16.049 …
#> # ℹ 1 more variable: wordcount <int>

Processing Transcripts

Process the transcript to consolidate comments and add dead air:

# Process the transcript with options:
# - consolidate_comments merges consecutive comments from the same
#   speaker when the pause between them is at most max_pause_sec seconds
# - add_dead_air inserts rows labelled dead_air_name for the silent gaps
#   between comments (visible as "dead_air" rows in the output below)
# - na_name is the label applied to comments with no identified speaker
processed_transcript <- process_zoom_transcript(
  transcript_file_path = transcript_file,
  consolidate_comments = TRUE,
  max_pause_sec = 1,
  add_dead_air = TRUE,
  dead_air_name = "dead_air",
  na_name = "unknown"
)

# View the first rows of the processed transcript
head(processed_transcript)
#> # A tibble: 6 × 8
#>   name          comment   start     end       duration wordcount transcript_file
#>   <chr>         <chr>     <time>    <time>       <dbl>     <int> <chr>          
#> 1 dead_air      <NA>      00'00.00" 00'00.05"   0.0500        NA GMT20240124-20…
#> 2 Srijani Ghosh Hi!       00'00.05" 00'01.79"   1.74           1 GMT20240124-20…
#> 3 dead_air      <NA>      00'01.79" 00'02.07"   0.280         NA GMT20240124-20…
#> 4 Conor Healy   Get this… 00'02.07" 00'04.05"   1.98           4 GMT20240124-20…
#> 5 dead_air      <NA>      00'04.05" 00'05.14"   1.09          NA GMT20240124-20…
#> 6 Srijani Ghosh Hello! H… 00'05.14" 00'08.31"   3.17           8 GMT20240124-20…
#> # ℹ 1 more variable: comment_num <int>

Calculating Summary Metrics

Single Transcript Analysis

Calculate engagement metrics for a single transcript:

# Calculate per-speaker engagement metrics for one transcript.
# names_exclude drops the synthetic "dead_air" rows so silence is not
# counted as a speaker. NOTE(review): the batch function
# summarize_transcript_files() spells this argument names_to_exclude —
# an API inconsistency worth confirming against the package docs.
summary_metrics <- summarize_transcript_metrics(
  transcript_file_path = transcript_file,
  names_exclude = c("dead_air"),
  consolidate_comments = TRUE,
  max_pause_sec = 1,
  add_dead_air = TRUE
)

# View the metrics: one row per speaker with comment count (n),
# duration, wordcount, and their percentage shares
summary_metrics
#> # A tibble: 6 × 10
#>   transcript_file   name      n duration wordcount comments n_perc duration_perc
#>   <chr>             <chr> <int>    <dbl>     <dbl> <I<list>  <dbl>         <dbl>
#> 1 GMT20240124-2029… Cono…    30  485.         1418 <chr>     66.7         72.2  
#> 2 GMT20240124-2029… Srij…     8   69.1         213 <chr>     17.8         10.3  
#> 3 GMT20240124-2029… Shre…     3   43.3          86 <chr>      6.67         6.45 
#> 4 GMT20240124-2029… Dr. …     2   42.7          98 <chr>      4.44         6.36 
#> 5 GMT20240124-2029… Ryan…     1   31.3          95 <chr>      2.22         4.65 
#> 6 GMT20240124-2029… unkn…     1    0.680         1 <chr>      2.22         0.101
#> # ℹ 2 more variables: wordcount_perc <dbl>, wpm <dbl>

Multiple Transcript Analysis

For multiple transcripts, use the batch processing function:

# Root folder containing the package's bundled example data
data_folder <- system.file("extdata", package = "zoomstudentengagement")

# Summarize one or more transcript files in a single call; pass a
# character vector of file names to process several at once.
batch_metrics <- summarize_transcript_files(
  transcript_file_names = "GMT20240124-202901_Recording.transcript.vtt",
  data_folder = data_folder,
  transcripts_folder = "transcripts",
  names_to_exclude = c("dead_air"),
  deduplicate_content = FALSE
)

# View batch results: same per-speaker metrics as the single-file case,
# plus transcript_file, transcript_path, and name_raw columns
head(batch_metrics)
#> # A tibble: 6 × 12
#>   name         n duration wordcount comments n_perc duration_perc wordcount_perc
#>   <chr>    <int>    <dbl>     <dbl> <I<list>  <dbl>         <dbl>          <dbl>
#> 1 Conor H…    30  485.         1418 <chr>     66.7         72.2          74.2   
#> 2 Srijani…     8   69.1         213 <chr>     17.8         10.3          11.1   
#> 3 Shreeha…     3   43.3          86 <chr>      6.67         6.45          4.50  
#> 4 Dr. Mel…     2   42.7          98 <chr>      4.44         6.36          5.13  
#> 5 Ryan Sl…     1   31.3          95 <chr>      2.22         4.65          4.97  
#> 6 unknown      1    0.680         1 <chr>      2.22         0.101         0.0523
#> # ℹ 4 more variables: wpm <dbl>, transcript_file <chr>, transcript_path <chr>,
#> #   name_raw <chr>

Working with Transcript Files

Loading Transcript File Lists

Load and organize multiple transcript files:

# Build an inventory of the session files found under
# data_folder/transcripts — transcript (.transcript.vtt),
# closed-caption (.cc.vtt), and chat (.txt) files — with the recording
# date/start time parsed from the file names.
transcript_files <- load_transcript_files_list(
  data_folder = data_folder,
  transcripts_folder = "transcripts"
)

# View the file list (one row per recording session)
transcript_files
#>   date_extract     recording_start    start_time_local
#> 1     20240124 2024-01-24 20:29:01 2024-01-24 12:29:01
#>                   closed_caption_file
#> 1 GMT20240124-202901_Recording.cc.vtt
#>                               transcript_file
#> 1 GMT20240124-202901_Recording.transcript.vtt
#>                                 chat_file
#> 1 GMT20240124-202901_RecordingnewChat.txt

Loading Zoom Recordings List

Load the list of Zoom recordings from CSV:

# Load the Zoom recorded-sessions CSV export(s) found in the data
# folder. Rows are filtered by department prefix in the Topic ("dept");
# note in the diagnostic output below that the personal-meeting-room row
# is dropped at the topic-parsing step. A matching time window
# (match_start_time / match_end_time) is also derived for each session.
# NOTE(review): semester_start_mdy presumably excludes recordings before
# that date, and the match window appears to span start time + 2 hours
# despite scheduled_session_length_hours = 1.5 — confirm both against
# the function documentation.
recordings_list <- load_zoom_recorded_sessions_list(
  data_folder = data_folder,
  transcripts_folder = "transcripts",
  dept = "LTF",
  semester_start_mdy = "Jan 01, 2024",
  scheduled_session_length_hours = 1.5
)
#> [1] "CSV files to process:"
#> [1] "zoomus_recordings__20240124.csv"
#> [1] "After reading CSV:"
#> # A tibble: 4 × 9
#>   filepath  Topic ID    `Start Time` `File Size (MB)` `File Count` `Total Views`
#>   <chr>     <chr> <chr> <chr>        <chr>                   <dbl>         <dbl>
#> 1 /private… LTF … 996 … Jan 24, 202… 147.0676                   11             0
#> 2 /private… LTF … 960 … Jan 18, 202… 1200.6551                  24             0
#> 3 /private… LTF … 960 … Jan 11, 202… 2317.0028                  36             0
#> 4 /private… Cono… 901 … Jan 03, 202… 2819.0706                   9             1
#> # ℹ 2 more variables: `Total Downloads` <dbl>, `Last Accessed` <chr>
#> [1] "After summarise:"
#>                                 Topic            ID            Start Time
#> 1 Conor Healy's Personal Meeting Room  901 075 7783 Jan 03, 2024 19:16:16
#> 2    LTF 23.24 - Thurs 6:30PM (Healy) 960 8322 8914 Jan 11, 2024 18:22:24
#> 3    LTF 23.24 - Thurs 6:30PM (Healy) 960 8322 8914 Jan 18, 2024 18:26:14
#> 4    LTF 23.24 - Thurs 6:30PM (Healy) 996 6354 4011 Jan 24, 2024 12:25:59
#>   File Size (MB) File Count Total Views Total Downloads         Last Accessed
#> 1      2819.0706          9           1               4 Jan 10, 2024 20:52:26
#> 2      2317.0028         36           0               6 Jan 18, 2024 11:43:46
#> 3      1200.6551         24           0               6 Jan 19, 2024 04:59:50
#> 4       147.0676         11           0               6 Jan 24, 2024 13:39:20
#> [1] "After topic parsing:"
#>                              Topic            ID            Start Time
#> 2 LTF 23.24 - Thurs 6:30PM (Healy) 960 8322 8914 Jan 11, 2024 18:22:24
#> 3 LTF 23.24 - Thurs 6:30PM (Healy) 960 8322 8914 Jan 18, 2024 18:26:14
#> 4 LTF 23.24 - Thurs 6:30PM (Healy) 996 6354 4011 Jan 24, 2024 12:25:59
#>   File Size (MB) File Count Total Views Total Downloads         Last Accessed
#> 2      2317.0028         36           0               6 Jan 18, 2024 11:43:46
#> 3      1200.6551         24           0               6 Jan 19, 2024 04:59:50
#> 4       147.0676         11           0               6 Jan 24, 2024 13:39:20
#>   dept course_section course section
#> 2  LTF          23.24     23      24
#> 3  LTF          23.24     23      24
#> 4  LTF          23.24     23      24
#> [1] "Start Time values:"
#> [1] "Jan 11, 2024 18:22:24" "Jan 18, 2024 18:26:14" "Jan 24, 2024 12:25:59"
#> [1] "After date parsing:"
#>                              Topic            ID            Start Time
#> 2 LTF 23.24 - Thurs 6:30PM (Healy) 960 8322 8914 Jan 11, 2024 18:22:24
#> 3 LTF 23.24 - Thurs 6:30PM (Healy) 960 8322 8914 Jan 18, 2024 18:26:14
#> 4 LTF 23.24 - Thurs 6:30PM (Healy) 996 6354 4011 Jan 24, 2024 12:25:59
#>   File Size (MB) File Count Total Views Total Downloads         Last Accessed
#> 2      2317.0028         36           0               6 Jan 18, 2024 11:43:46
#> 3      1200.6551         24           0               6 Jan 19, 2024 04:59:50
#> 4       147.0676         11           0               6 Jan 24, 2024 13:39:20
#>   dept course_section course section    match_start_time      match_end_time
#> 2  LTF          23.24     23      24 2024-01-11 18:22:24 2024-01-11 20:22:24
#> 3  LTF          23.24     23      24 2024-01-18 18:26:14 2024-01-18 20:26:14
#> 4  LTF          23.24     23      24 2024-01-24 12:25:59 2024-01-24 14:25:59
#> [1] "Final result after filtering:"
#>                              Topic            ID            Start Time
#> 2 LTF 23.24 - Thurs 6:30PM (Healy) 960 8322 8914 Jan 11, 2024 18:22:24
#> 3 LTF 23.24 - Thurs 6:30PM (Healy) 960 8322 8914 Jan 18, 2024 18:26:14
#> 4 LTF 23.24 - Thurs 6:30PM (Healy) 996 6354 4011 Jan 24, 2024 12:25:59
#>   File Size (MB) File Count Total Views Total Downloads         Last Accessed
#> 2      2317.0028         36           0               6 Jan 18, 2024 11:43:46
#> 3      1200.6551         24           0               6 Jan 19, 2024 04:59:50
#> 4       147.0676         11           0               6 Jan 24, 2024 13:39:20
#>   dept course_section course section    match_start_time      match_end_time
#> 2  LTF          23.24     23      24 2024-01-11 18:22:24 2024-01-11 20:22:24
#> 3  LTF          23.24     23      24 2024-01-18 18:26:14 2024-01-18 20:26:14
#> 4  LTF          23.24     23      24 2024-01-24 12:25:59 2024-01-24 14:25:59

# View the cleaned recordings list: one row per matched class session,
# with parsed dept/course/section and the match time window
recordings_list
#> # A tibble: 3 × 14
#>   Topic           ID    `Start Time` `File Size (MB)` `File Count` `Total Views`
#>   <chr>           <chr> <chr>                   <dbl>        <dbl>         <dbl>
#> 1 LTF 23.24 - Th… 960 … Jan 11, 202…            2317.           36             0
#> 2 LTF 23.24 - Th… 960 … Jan 18, 202…            1201.           24             0
#> 3 LTF 23.24 - Th… 996 … Jan 24, 202…             147.           11             0
#> # ℹ 8 more variables: `Total Downloads` <dbl>, `Last Accessed` <chr>,
#> #   dept <chr>, course_section <chr>, course <int>, section <int>,
#> #   match_start_time <dttm>, match_end_time <dttm>

Advanced Processing Options

Content Deduplication

Handle duplicate transcripts intelligently:

# Process with content-based deduplication of near-identical transcripts.
# NOTE(review): similarity_threshold and duplicate_method control how
# duplicate transcripts are detected — confirm their exact semantics in
# the function documentation.
# No names_to_exclude is passed here, so the synthetic "dead_air" rows
# remain visible in the results below.
deduplicated_metrics <- summarize_transcript_files(
  transcript_file_names = "GMT20240124-202901_Recording.transcript.vtt",
  data_folder = data_folder,
  transcripts_folder = "transcripts",
  deduplicate_content = TRUE,
  similarity_threshold = 0.95,
  duplicate_method = "hybrid"
)

# View deduplicated results
head(deduplicated_metrics)
#> # A tibble: 6 × 12
#>   name         n duration wordcount comments n_perc duration_perc wordcount_perc
#>   <chr>    <int>    <dbl>     <dbl> <I<list>  <dbl>         <dbl>          <dbl>
#> 1 Conor H…    30    485.       1418 <chr>     33.3          65.9           74.2 
#> 2 Srijani…     8     69.1       213 <chr>      8.89          9.39          11.1 
#> 3 dead_air    45     63.6         0 <chr>     50             8.65           0   
#> 4 Shreeha…     3     43.3        86 <chr>      3.33          5.89           4.50
#> 5 Dr. Mel…     2     42.7        98 <chr>      2.22          5.81           5.13
#> 6 Ryan Sl…     1     31.3        95 <chr>      1.11          4.25           4.97
#> # ℹ 4 more variables: wpm <dbl>, transcript_file <chr>, transcript_path <chr>,
#> #   name_raw <chr>

Custom Processing Parameters

Fine-tune the processing parameters:

# Reprocess with custom labels and a longer pause threshold. Raising
# max_pause_sec to 2 merges same-speaker comments separated by up to
# 2 seconds of silence (compare row 6 below — duration 6.53 — with the
# 3.17 of the earlier 1-second run). dead_air_name and na_name simply
# rename the labels used for silence and unidentified speakers.
custom_processed <- process_zoom_transcript(
  transcript_file_path = transcript_file,
  consolidate_comments = TRUE,
  max_pause_sec = 2, # Longer pause threshold
  add_dead_air = TRUE,
  dead_air_name = "silence",
  na_name = "unidentified"
)

# View custom processed transcript
head(custom_processed)
#> # A tibble: 6 × 8
#>   name          comment   start     end       duration wordcount transcript_file
#>   <chr>         <chr>     <time>    <time>       <dbl>     <int> <chr>          
#> 1 silence       <NA>      00'00.00" 00'00.05"   0.0500        NA GMT20240124-20…
#> 2 Srijani Ghosh Hi!       00'00.05" 00'01.79"   1.74           1 GMT20240124-20…
#> 3 silence       <NA>      00'01.79" 00'02.07"   0.280         NA GMT20240124-20…
#> 4 Conor Healy   Get this… 00'02.07" 00'04.05"   1.98           4 GMT20240124-20…
#> 5 silence       <NA>      00'04.05" 00'05.14"   1.09          NA GMT20240124-20…
#> 6 Srijani Ghosh Hello! H… 00'05.14" 00'11.67"   6.53           9 GMT20240124-20…
#> # ℹ 1 more variable: comment_num <int>

Understanding the Output

Transcript Structure

The processed transcript contains:

  • start: When the comment started
  • end: When the comment ended
  • duration: Length of the comment
  • name: Speaker name
  • comment: The actual text
  • wordcount: Number of words

Metrics Structure

The summary metrics include:

  • name: Speaker name
  • n: Number of comments
  • n_perc: Percentage of total comments
  • duration: Total speaking time
  • duration_perc: Percentage of total time
  • wordcount: Total words spoken
  • wordcount_perc: Percentage of total words
  • wpm: Words per minute

Next Steps