library(zoomstudentengagement)
library(dplyr)
library(ggplot2)
This vignette shows how to load, process, and analyze Zoom transcripts using the zoomstudentengagement package.

Start by loading a raw Zoom transcript:
# Load a sample transcript shipped with the package.
# system.file() resolves the path to the installed package's extdata folder.
transcript_file <- system.file(
  "extdata/transcripts/GMT20240124-202901_Recording.transcript.vtt",
  package = "zoomstudentengagement"
)

# Load the raw transcript (one row per VTT caption entry)
raw_transcript <- load_zoom_transcript(transcript_file_path = transcript_file)

# View the structure
head(raw_transcript)
#> # A tibble: 6 × 8
#> transcript_file comment_num name comment start end duration
#> <chr> <chr> <chr> <chr> <time> <time> <drtn>
#> 1 GMT20240124-202901_Re… 1 Srij… Hi! 00'00.05" 00'01.790" 1.740 …
#> 2 GMT20240124-202901_Re… 2 Cono… Get th… 00'02.07" 00'04.050" 1.980 …
#> 3 GMT20240124-202901_Re… 3 Srij… Hello!… 00'05.14" 00'08.310" 3.170 …
#> 4 GMT20240124-202901_Re… 4 Srij… So 00'09.71" 00'11.670" 1.960 …
#> 5 GMT20240124-202901_Re… 5 Cono… let's … 00'12.16" 00'14.309" 2.149 …
#> 6 GMT20240124-202901_Re… 6 Cono… Studen… 00'14.55" 00'30.599" 16.049 …
#> # ℹ 1 more variable: wordcount <int>
Process the transcript to consolidate comments and add dead air:
# Process the transcript with options:
# - consolidate_comments merges consecutive comments by the same speaker
#   separated by at most max_pause_sec seconds
# - add_dead_air inserts rows (named via dead_air_name) covering the
#   silence gaps between comments
# - na_name labels comments whose speaker could not be identified
processed_transcript <- process_zoom_transcript(
  transcript_file_path = transcript_file,
  consolidate_comments = TRUE,
  max_pause_sec = 1,
  add_dead_air = TRUE,
  dead_air_name = "dead_air",
  na_name = "unknown"
)

# View the processed transcript
head(processed_transcript)
#> # A tibble: 6 × 8
#> name comment start end duration wordcount transcript_file
#> <chr> <chr> <time> <time> <dbl> <int> <chr>
#> 1 dead_air <NA> 00'00.00" 00'00.05" 0.0500 NA GMT20240124-20…
#> 2 Srijani Ghosh Hi! 00'00.05" 00'01.79" 1.74 1 GMT20240124-20…
#> 3 dead_air <NA> 00'01.79" 00'02.07" 0.280 NA GMT20240124-20…
#> 4 Conor Healy Get this… 00'02.07" 00'04.05" 1.98 4 GMT20240124-20…
#> 5 dead_air <NA> 00'04.05" 00'05.14" 1.09 NA GMT20240124-20…
#> 6 Srijani Ghosh Hello! H… 00'05.14" 00'08.31" 3.17 8 GMT20240124-20…
#> # ℹ 1 more variable: comment_num <int>
Calculate engagement metrics for a single transcript:
# Calculate per-speaker summary metrics for a single transcript.
# names_exclude drops the synthetic dead-air rows from the summary.
summary_metrics <- summarize_transcript_metrics(
  transcript_file_path = transcript_file,
  names_exclude = c("dead_air"),
  consolidate_comments = TRUE,
  max_pause_sec = 1,
  add_dead_air = TRUE
)

# View the metrics
summary_metrics
#> # A tibble: 6 × 10
#> transcript_file name n duration wordcount comments n_perc duration_perc
#> <chr> <chr> <int> <dbl> <dbl> <I<list> <dbl> <dbl>
#> 1 GMT20240124-2029… Cono… 30 485. 1418 <chr> 66.7 72.2
#> 2 GMT20240124-2029… Srij… 8 69.1 213 <chr> 17.8 10.3
#> 3 GMT20240124-2029… Shre… 3 43.3 86 <chr> 6.67 6.45
#> 4 GMT20240124-2029… Dr. … 2 42.7 98 <chr> 4.44 6.36
#> 5 GMT20240124-2029… Ryan… 1 31.3 95 <chr> 2.22 4.65
#> 6 GMT20240124-2029… unkn… 1 0.680 1 <chr> 2.22 0.101
#> # ℹ 2 more variables: wordcount_perc <dbl>, wpm <dbl>
For multiple transcripts, use the batch processing function:
# Set up data folder path (the package's installed extdata directory)
data_folder <- system.file("extdata", package = "zoomstudentengagement")

# Process multiple transcripts in one call; transcript_file_names may be a
# vector of file names located under data_folder/transcripts_folder.
batch_metrics <- summarize_transcript_files(
  transcript_file_names = "GMT20240124-202901_Recording.transcript.vtt",
  data_folder = data_folder,
  transcripts_folder = "transcripts",
  names_to_exclude = c("dead_air"),
  deduplicate_content = FALSE
)

# View batch results
head(batch_metrics)
#> # A tibble: 6 × 12
#> name n duration wordcount comments n_perc duration_perc wordcount_perc
#> <chr> <int> <dbl> <dbl> <I<list> <dbl> <dbl> <dbl>
#> 1 Conor H… 30 485. 1418 <chr> 66.7 72.2 74.2
#> 2 Srijani… 8 69.1 213 <chr> 17.8 10.3 11.1
#> 3 Shreeha… 3 43.3 86 <chr> 6.67 6.45 4.50
#> 4 Dr. Mel… 2 42.7 98 <chr> 4.44 6.36 5.13
#> 5 Ryan Sl… 1 31.3 95 <chr> 2.22 4.65 4.97
#> 6 unknown 1 0.680 1 <chr> 2.22 0.101 0.0523
#> # ℹ 4 more variables: wpm <dbl>, transcript_file <chr>, transcript_path <chr>,
#> # name_raw <chr>
Load and organize multiple transcript files:
# Load and organize the transcript files found in the transcripts folder;
# returns one row per recording with its transcript, closed-caption, and
# chat file names plus parsed recording timestamps.
transcript_files <- load_transcript_files_list(
  data_folder = data_folder,
  transcripts_folder = "transcripts"
)

# View the file list
transcript_files
#> date_extract recording_start start_time_local
#> 1 20240124 2024-01-24 20:29:01 2024-01-24 12:29:01
#> closed_caption_file
#> 1 GMT20240124-202901_Recording.cc.vtt
#> transcript_file
#> 1 GMT20240124-202901_Recording.transcript.vtt
#> chat_file
#> 1 GMT20240124-202901_RecordingnewChat.txt
Load the list of Zoom recordings from CSV:
# Load the list of Zoom recorded sessions from the exported CSV.
# Rows are filtered to the given dept, sessions on/after
# semester_start_mdy, and a match window of
# scheduled_session_length_hours is computed for each recording.
recordings_list <- load_zoom_recorded_sessions_list(
  data_folder = data_folder,
  transcripts_folder = "transcripts",
  dept = "LTF",
  semester_start_mdy = "Jan 01, 2024",
  scheduled_session_length_hours = 1.5
)
#> [1] "CSV files to process:"
#> [1] "zoomus_recordings__20240124.csv"
#> [1] "After reading CSV:"
#> # A tibble: 4 × 9
#> filepath Topic ID `Start Time` `File Size (MB)` `File Count` `Total Views`
#> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
#> 1 /private… LTF … 996 … Jan 24, 202… 147.0676 11 0
#> 2 /private… LTF … 960 … Jan 18, 202… 1200.6551 24 0
#> 3 /private… LTF … 960 … Jan 11, 202… 2317.0028 36 0
#> 4 /private… Cono… 901 … Jan 03, 202… 2819.0706 9 1
#> # ℹ 2 more variables: `Total Downloads` <dbl>, `Last Accessed` <chr>
#> [1] "After summarise:"
#> Topic ID Start Time
#> 1 Conor Healy's Personal Meeting Room 901 075 7783 Jan 03, 2024 19:16:16
#> 2 LTF 23.24 - Thurs 6:30PM (Healy) 960 8322 8914 Jan 11, 2024 18:22:24
#> 3 LTF 23.24 - Thurs 6:30PM (Healy) 960 8322 8914 Jan 18, 2024 18:26:14
#> 4 LTF 23.24 - Thurs 6:30PM (Healy) 996 6354 4011 Jan 24, 2024 12:25:59
#> File Size (MB) File Count Total Views Total Downloads Last Accessed
#> 1 2819.0706 9 1 4 Jan 10, 2024 20:52:26
#> 2 2317.0028 36 0 6 Jan 18, 2024 11:43:46
#> 3 1200.6551 24 0 6 Jan 19, 2024 04:59:50
#> 4 147.0676 11 0 6 Jan 24, 2024 13:39:20
#> [1] "After topic parsing:"
#> Topic ID Start Time
#> 2 LTF 23.24 - Thurs 6:30PM (Healy) 960 8322 8914 Jan 11, 2024 18:22:24
#> 3 LTF 23.24 - Thurs 6:30PM (Healy) 960 8322 8914 Jan 18, 2024 18:26:14
#> 4 LTF 23.24 - Thurs 6:30PM (Healy) 996 6354 4011 Jan 24, 2024 12:25:59
#> File Size (MB) File Count Total Views Total Downloads Last Accessed
#> 2 2317.0028 36 0 6 Jan 18, 2024 11:43:46
#> 3 1200.6551 24 0 6 Jan 19, 2024 04:59:50
#> 4 147.0676 11 0 6 Jan 24, 2024 13:39:20
#> dept course_section course section
#> 2 LTF 23.24 23 24
#> 3 LTF 23.24 23 24
#> 4 LTF 23.24 23 24
#> [1] "Start Time values:"
#> [1] "Jan 11, 2024 18:22:24" "Jan 18, 2024 18:26:14" "Jan 24, 2024 12:25:59"
#> [1] "After date parsing:"
#> Topic ID Start Time
#> 2 LTF 23.24 - Thurs 6:30PM (Healy) 960 8322 8914 Jan 11, 2024 18:22:24
#> 3 LTF 23.24 - Thurs 6:30PM (Healy) 960 8322 8914 Jan 18, 2024 18:26:14
#> 4 LTF 23.24 - Thurs 6:30PM (Healy) 996 6354 4011 Jan 24, 2024 12:25:59
#> File Size (MB) File Count Total Views Total Downloads Last Accessed
#> 2 2317.0028 36 0 6 Jan 18, 2024 11:43:46
#> 3 1200.6551 24 0 6 Jan 19, 2024 04:59:50
#> 4 147.0676 11 0 6 Jan 24, 2024 13:39:20
#> dept course_section course section match_start_time match_end_time
#> 2 LTF 23.24 23 24 2024-01-11 18:22:24 2024-01-11 20:22:24
#> 3 LTF 23.24 23 24 2024-01-18 18:26:14 2024-01-18 20:26:14
#> 4 LTF 23.24 23 24 2024-01-24 12:25:59 2024-01-24 14:25:59
#> [1] "Final result after filtering:"
#> Topic ID Start Time
#> 2 LTF 23.24 - Thurs 6:30PM (Healy) 960 8322 8914 Jan 11, 2024 18:22:24
#> 3 LTF 23.24 - Thurs 6:30PM (Healy) 960 8322 8914 Jan 18, 2024 18:26:14
#> 4 LTF 23.24 - Thurs 6:30PM (Healy) 996 6354 4011 Jan 24, 2024 12:25:59
#> File Size (MB) File Count Total Views Total Downloads Last Accessed
#> 2 2317.0028 36 0 6 Jan 18, 2024 11:43:46
#> 3 1200.6551 24 0 6 Jan 19, 2024 04:59:50
#> 4 147.0676 11 0 6 Jan 24, 2024 13:39:20
#> dept course_section course section match_start_time match_end_time
#> 2 LTF 23.24 23 24 2024-01-11 18:22:24 2024-01-11 20:22:24
#> 3 LTF 23.24 23 24 2024-01-18 18:26:14 2024-01-18 20:26:14
#> 4 LTF 23.24 23 24 2024-01-24 12:25:59 2024-01-24 14:25:59
# View the recordings list
recordings_list
#> # A tibble: 3 × 14
#> Topic ID `Start Time` `File Size (MB)` `File Count` `Total Views`
#> <chr> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 LTF 23.24 - Th… 960 … Jan 11, 202… 2317. 36 0
#> 2 LTF 23.24 - Th… 960 … Jan 18, 202… 1201. 24 0
#> 3 LTF 23.24 - Th… 996 … Jan 24, 202… 147. 11 0
#> # ℹ 8 more variables: `Total Downloads` <dbl>, `Last Accessed` <chr>,
#> # dept <chr>, course_section <chr>, course <int>, section <int>,
#> # match_start_time <dttm>, match_end_time <dttm>
Handle duplicate transcripts intelligently:
# Process with content deduplication enabled: transcripts whose content
# similarity exceeds similarity_threshold are treated as duplicates,
# using the "hybrid" comparison method.
deduplicated_metrics <- summarize_transcript_files(
  transcript_file_names = "GMT20240124-202901_Recording.transcript.vtt",
  data_folder = data_folder,
  transcripts_folder = "transcripts",
  deduplicate_content = TRUE,
  similarity_threshold = 0.95,
  duplicate_method = "hybrid"
)

# View deduplicated results
head(deduplicated_metrics)
#> # A tibble: 6 × 12
#> name n duration wordcount comments n_perc duration_perc wordcount_perc
#> <chr> <int> <dbl> <dbl> <I<list> <dbl> <dbl> <dbl>
#> 1 Conor H… 30 485. 1418 <chr> 33.3 65.9 74.2
#> 2 Srijani… 8 69.1 213 <chr> 8.89 9.39 11.1
#> 3 dead_air 45 63.6 0 <chr> 50 8.65 0
#> 4 Shreeha… 3 43.3 86 <chr> 3.33 5.89 4.50
#> 5 Dr. Mel… 2 42.7 98 <chr> 2.22 5.81 5.13
#> 6 Ryan Sl… 1 31.3 95 <chr> 1.11 4.25 4.97
#> # ℹ 4 more variables: wpm <dbl>, transcript_file <chr>, transcript_path <chr>,
#> # name_raw <chr>
Fine-tune the processing parameters:
# Custom processing with specific parameters: a longer pause threshold
# merges more consecutive comments, and the synthetic row labels are
# customized via dead_air_name / na_name.
custom_processed <- process_zoom_transcript(
  transcript_file_path = transcript_file,
  consolidate_comments = TRUE,
  max_pause_sec = 2, # Longer pause threshold
  add_dead_air = TRUE,
  dead_air_name = "silence",
  na_name = "unidentified"
)

# View custom processed transcript
head(custom_processed)
#> # A tibble: 6 × 8
#> name comment start end duration wordcount transcript_file
#> <chr> <chr> <time> <time> <dbl> <int> <chr>
#> 1 silence <NA> 00'00.00" 00'00.05" 0.0500 NA GMT20240124-20…
#> 2 Srijani Ghosh Hi! 00'00.05" 00'01.79" 1.74 1 GMT20240124-20…
#> 3 silence <NA> 00'01.79" 00'02.07" 0.280 NA GMT20240124-20…
#> 4 Conor Healy Get this… 00'02.07" 00'04.05" 1.98 4 GMT20240124-20…
#> 5 silence <NA> 00'04.05" 00'05.14" 1.09 NA GMT20240124-20…
#> 6 Srijani Ghosh Hello! H… 00'05.14" 00'11.67" 6.53 9 GMT20240124-20…
#> # ℹ 1 more variable: comment_num <int>
The processed transcript contains the columns name, comment, start, end, duration, wordcount, transcript_file, and comment_num.
The summary metrics include transcript_file, name, n, duration, wordcount, comments, n_perc, duration_perc, wordcount_perc, and wpm.