From 11f942dfb9c1af54ade05f793d48480ff3cfc4ae Mon Sep 17 00:00:00 2001
From: Adrian Petrescu
Date: Mon, 7 Nov 2022 15:34:15 -0500
Subject: [PATCH] Add support for WebVTT timeframes in MS Teams' non-compliant format

---
Reviewer note (ignored by `git am`): TIMESTAMP_PATTERN is written as a raw
string so its regex escapes (`\d`) are not treated as invalid string escapes
by newer Python versions. The compiled pattern is unchanged.

 tests/subtitles/teams_timeframe.vtt | 7 +++++++
 tests/test_webvtt_parser.py         | 7 +++++++
 webvtt/parsers.py                   | 2 +-
 webvtt/structures.py                | 2 +-
 4 files changed, 16 insertions(+), 2 deletions(-)
 create mode 100644 tests/subtitles/teams_timeframe.vtt

diff --git a/tests/subtitles/teams_timeframe.vtt b/tests/subtitles/teams_timeframe.vtt
new file mode 100644
index 0000000..03f63a6
--- /dev/null
+++ b/tests/subtitles/teams_timeframe.vtt
@@ -0,0 +1,7 @@
+WEBVTT
+
+0:0:0.0 --> 0:0:0.570
+There you go.
+
+0:0:0.210 --> 0:0:1.130
+Why hello there, Yuri.
diff --git a/tests/test_webvtt_parser.py b/tests/test_webvtt_parser.py
index a26741a..fac638c 100644
--- a/tests/test_webvtt_parser.py
+++ b/tests/test_webvtt_parser.py
@@ -90,6 +90,13 @@ def test_webvtt_timestamps_format(self):
         self.assertEqual(vtt.captions[2].start, '00:00:11.890')
         self.assertEqual(vtt.captions[2].end, '00:00:16.320')
 
+    def test_webvtt_microsoft_teams_timestamps_format(self):
+        vtt = webvtt.read(self._get_file('teams_timeframe.vtt'))
+        self.assertEqual(vtt.captions[0].start, '00:00:00.000')
+        self.assertEqual(vtt.captions[0].end, '00:00:00.570')
+        self.assertEqual(vtt.captions[1].start, '00:00:00.210')
+        self.assertEqual(vtt.captions[1].end, '00:00:01.130')
+
     def test_parse_timestamp(self):
         caption = Caption(start='02:03:11.890')
         self.assertEqual(
diff --git a/webvtt/parsers.py b/webvtt/parsers.py
index 3a978ca..e95a654 100644
--- a/webvtt/parsers.py
+++ b/webvtt/parsers.py
@@ -155,7 +155,7 @@ class WebVTTParser(TextBasedParser):
     WebVTT parser.
     """
 
-    TIMEFRAME_LINE_PATTERN = re.compile(r'\s*((?:\d+:)?\d{2}:\d{2}.\d{3})\s*-->\s*((?:\d+:)?\d{2}:\d{2}.\d{3})')
+    TIMEFRAME_LINE_PATTERN = re.compile(r'\s*((?:\d+:)?\d{1,2}:\d{1,2}.\d{1,3})\s*-->\s*((?:\d+:)?\d{1,2}:\d{1,2}.\d{1,3})')
     COMMENT_PATTERN = re.compile(r'NOTE(?:\s.+|$)')
     STYLE_PATTERN = re.compile(r'STYLE[ \t]*$')
 
diff --git a/webvtt/structures.py b/webvtt/structures.py
index 84f376d..4fa04d8 100644
--- a/webvtt/structures.py
+++ b/webvtt/structures.py
@@ -2,7 +2,7 @@
 
 from .errors import MalformedCaptionError
 
-TIMESTAMP_PATTERN = re.compile('(\d+)?:?(\d{2}):(\d{2})[.,](\d{3})')
+TIMESTAMP_PATTERN = re.compile(r'(\d+)?:?(\d{1,2}):(\d{1,2})[.,](\d{1,3})')
 
 __all__ = ['Caption']
 