summaryrefslogtreecommitdiff
path: root/gst
diff options
context:
space:
mode:
authorJan Schmidt <jan@centricular.com>2016-03-07 23:29:43 +1100
committerJan Schmidt <jan@centricular.com>2016-03-25 00:58:42 +1100
commitfd2a14144a7ae7a15c39310bc748a9bda5321a70 (patch)
treeb5b5bb7459c67bc64e38c974ce9607807af24fb8 /gst
parentecb8d2e023762477f1284d8032309457b6b20be8 (diff)
subparse: WebVTT parsing support
WebVTT is a new subtitle format for HTML5 video. In this first version of the parser the cue settings are parsed but only stored in the internal parser state structure. Later on these settings could be part of the GstBuffer metadata. https://bugzilla.gnome.org/show_bug.cgi?id=629764
Diffstat (limited to 'gst')
-rw-r--r--gst/subparse/gstsubparse.c317
-rw-r--r--gst/subparse/gstsubparse.h11
2 files changed, 258 insertions, 70 deletions
diff --git a/gst/subparse/gstsubparse.c b/gst/subparse/gstsubparse.c
index d59df57a9..c4c4b47b3 100644
--- a/gst/subparse/gstsubparse.c
+++ b/gst/subparse/gstsubparse.c
@@ -2,6 +2,8 @@
* Copyright (C) <1999> Erik Walthinsen <omega@cse.ogi.edu>
* Copyright (C) 2004 Ronald S. Bultje <rbultje@ronald.bitfreak.net>
* Copyright (C) 2006 Tim-Philipp Müller <tim centricular net>
+ * Copyright (C) 2016 Philippe Normand <pnormand@igalia.com>
+ * Copyright (C) 2016 Jan Schmidt <jan@centricular.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
@@ -39,6 +41,10 @@
GST_DEBUG_CATEGORY (sub_parse_debug);
#define DEFAULT_ENCODING NULL
+#define ATTRIBUTE_REGEX "\\s?[a-zA-Z0-9\\. \t\\(\\)]*"
+static const gchar *allowed_srt_tags[] = { "i", "b", "u", NULL };
+static const gchar *allowed_vtt_tags[] =
+ { "i", "b", "c", "u", "v", "ruby", "rt", NULL };
enum
{
@@ -61,7 +67,7 @@ static GstStaticPadTemplate sink_templ = GST_STATIC_PAD_TEMPLATE ("sink",
GST_STATIC_CAPS ("application/x-subtitle; application/x-subtitle-sami; "
"application/x-subtitle-tmplayer; application/x-subtitle-mpl2; "
"application/x-subtitle-dks; application/x-subtitle-qttext;"
- "application/x-subtitle-lrc;")
+ "application/x-subtitle-lrc; application/x-subtitle-vtt")
);
static GstStaticPadTemplate src_templ = GST_STATIC_PAD_TEMPLATE ("src",
@@ -370,6 +376,8 @@ gst_sub_parse_get_format_description (GstSubParseFormat format)
return "SubViewer";
case GST_SUB_PARSE_FORMAT_DKS:
return "DKS";
+ case GST_SUB_PARSE_FORMAT_VTT:
+ return "WebVTT";
case GST_SUB_PARSE_FORMAT_QTTEXT:
return "QTtext";
case GST_SUB_PARSE_FORMAT_LRC:
@@ -663,45 +671,42 @@ strip_trailing_newlines (gchar * txt)
* escaping everything (the text between these simple markers isn't
* necessarily escaped, so it seems best to do it like this) */
static void
-subrip_unescape_formatting (gchar * txt)
+subrip_unescape_formatting (gchar * txt, gconstpointer allowed_tags_ptr,
+ gboolean allows_tag_attributes)
{
- gchar *pos;
+ gchar *res;
+ GRegex *tag_regex;
+ gchar *allowed_tags_pattern, *search_pattern;
+ const gchar *replace_pattern;
- for (pos = txt; pos != NULL && *pos != '\0'; ++pos) {
- if (g_ascii_strncasecmp (pos, "&lt;u&gt;", 9) == 0 ||
- g_ascii_strncasecmp (pos, "&lt;i&gt;", 9) == 0 ||
- g_ascii_strncasecmp (pos, "&lt;b&gt;", 9) == 0) {
- pos[0] = '<';
- pos[1] = g_ascii_tolower (pos[4]);
- pos[2] = '>';
- /* move NUL terminator as well */
- memmove (pos + 3, pos + 9, strlen (pos + 9) + 1);
- pos += 2;
- }
+ /* No processing needed if no escaped tag marker found in the string. */
+ if (strstr (txt, "&lt;") == NULL)
+ return;
+
+ /* Build a list of alternates for our regexp.
+ * FIXME: Could be built once and stored */
+ allowed_tags_pattern = g_strjoinv ("|", (gchar **) allowed_tags_ptr);
+ /* Look for starting/ending escaped tags with optional attributes. */
+ search_pattern = g_strdup_printf ("&lt;(/)?\\ *(%s)(%s)&gt;",
+ allowed_tags_pattern, ATTRIBUTE_REGEX);
+ /* And unescape appropriately */
+ if (allows_tag_attributes) {
+ replace_pattern = "<\\1\\2\\3>";
+ } else {
+ replace_pattern = "<\\1\\2>";
}
- for (pos = txt; pos != NULL && *pos != '\0'; ++pos) {
- gchar *tag;
+ tag_regex = g_regex_new (search_pattern, 0, 0, NULL);
+ res = g_regex_replace (tag_regex, txt, strlen (txt), 0,
+ replace_pattern, 0, NULL);
- /* look for start of an escaped closing tag */
- if (g_ascii_strncasecmp (pos, "&lt;/", 5) != 0)
- continue;
- tag = pos + 5;
- while (*tag == ' ')
- ++tag;
- if ((*tag == 'u' || *tag == 'i' || *tag == 'b') &&
- g_ascii_strncasecmp (tag + 1, "&gt;", 4) == 0) {
- gsize tag_len = (guintptr) (tag + 1 + 4 - pos);
-
- pos[0] = '<';
- pos[1] = '/';
- pos[2] = g_ascii_tolower (*tag);
- pos[3] = '>';
- /* move NUL terminator as well */
- memmove (pos + 4, pos + tag_len, strlen (pos + tag_len) + 1);
- pos += 3;
- }
- }
+ /* res will always be shorter than the input or identical, so this
+ * copy is OK */
+ strcpy (txt, res);
+
+ g_free (res);
+ g_free (search_pattern);
+ g_free (allowed_tags_pattern);
}
@@ -740,16 +745,25 @@ subrip_remove_unhandled_tags (gchar * txt)
}
}
-/* we only allow <i>, <u> and <b>, so let's take a simple approach. This code
- * assumes the input has been escaped and subrip_unescape_formatting() has then
- * been run over the input! This function adds missing closing markup tags and
- * removes broken closing tags for tags that have never been opened. */
+/* we only allow a fixed set of tags like <i>, <u> and <b>, so let's
+ * take a simple approach. This code assumes the input has been
+ * escaped and subrip_unescape_formatting() has then been run over the
+ * input! This function adds missing closing markup tags and removes
+ * broken closing tags for tags that have never been opened. */
static void
-subrip_fix_up_markup (gchar ** p_txt)
+subrip_fix_up_markup (gchar ** p_txt, gconstpointer allowed_tags_ptr)
{
gchar *cur, *next_tag;
- gchar open_tags[32];
+ gchar *open_tags[32];
guint num_open_tags = 0;
+ const gchar *iter_tag;
+ guint offset = 0;
+ guint index;
+ gchar *cur_tag;
+ gchar *end_tag;
+ GRegex *tag_regex;
+ GMatchInfo *match_info;
+ gchar **allowed_tags = (gchar **) allowed_tags_ptr;
g_assert (*p_txt != NULL);
@@ -758,33 +772,56 @@ subrip_fix_up_markup (gchar ** p_txt)
next_tag = strchr (cur, '<');
if (next_tag == NULL)
break;
- ++next_tag;
- switch (*next_tag) {
- case '/':{
- ++next_tag;
- if (num_open_tags == 0 || open_tags[num_open_tags - 1] != *next_tag) {
- GST_LOG ("broken input, closing tag '%c' is not open", *next_tag);
- memmove (next_tag - 2, next_tag + 2, strlen (next_tag + 2) + 1);
- next_tag -= 2;
- } else {
- /* it's all good, closing tag which is open */
- --num_open_tags;
+ offset = 0;
+ index = 0;
+ while (index < g_strv_length (allowed_tags)) {
+ iter_tag = allowed_tags[index];
+ /* Look for a white listed tag */
+ cur_tag = g_strconcat ("<", iter_tag, ATTRIBUTE_REGEX, ">", NULL);
+ tag_regex = g_regex_new (cur_tag, 0, 0, NULL);
+ g_regex_match (tag_regex, next_tag, 0, &match_info);
+
+ if (g_match_info_matches (match_info)) {
+ gint start_pos, end_pos;
+ gchar *word = g_match_info_fetch (match_info, 0);
+ g_match_info_fetch_pos (match_info, 0, &start_pos, &end_pos);
+ if (start_pos == 0) {
+ offset = strlen (word);
}
- break;
+ g_free (word);
}
- case 'i':
- case 'b':
- case 'u':
- if (num_open_tags == G_N_ELEMENTS (open_tags))
- return; /* something dodgy is going on, stop parsing */
- open_tags[num_open_tags] = *next_tag;
+ g_match_info_free (match_info);
+ g_regex_unref (tag_regex);
+ g_free (cur_tag);
+ index++;
+ if (offset) {
+ /* OK we found a tag, let's keep track of it */
+ open_tags[num_open_tags] = g_strdup (iter_tag);
++num_open_tags;
break;
- default:
- GST_ERROR ("unexpected tag '%c' (%s)", *next_tag, next_tag);
- g_assert_not_reached ();
- break;
+ }
}
+
+ if (offset) {
+ next_tag += offset;
+ cur = next_tag;
+ continue;
+ }
+
+ if (*next_tag == '<' && *(next_tag + 1) == '/') {
+ end_tag = strchr (cur, '>');
+ if (num_open_tags == 0
+ || g_ascii_strncasecmp (end_tag - 1, open_tags[num_open_tags - 1],
+ strlen (open_tags[num_open_tags - 1]))) {
+ GST_LOG ("broken input, closing tag '%s' is not open", next_tag);
+ memmove (next_tag, end_tag + 1, strlen (end_tag) + 1);
+ next_tag -= strlen (end_tag);
+ } else {
+ --num_open_tags;
+ g_free (open_tags[num_open_tags]);
+ }
+ }
+ ++next_tag;
cur = next_tag;
}
@@ -793,11 +830,12 @@ subrip_fix_up_markup (gchar ** p_txt)
s = g_string_new (*p_txt);
while (num_open_tags > 0) {
- GST_LOG ("adding missing closing tag '%c'", open_tags[num_open_tags - 1]);
+ GST_LOG ("adding missing closing tag '%s'", open_tags[num_open_tags - 1]);
g_string_append_c (s, '<');
g_string_append_c (s, '/');
- g_string_append_c (s, open_tags[num_open_tags - 1]);
+ g_string_append (s, open_tags[num_open_tags - 1]);
g_string_append_c (s, '>');
+ g_free (open_tags[num_open_tags - 1]);
--num_open_tags;
}
g_free (*p_txt);
@@ -857,6 +895,62 @@ parse_subrip_time (const gchar * ts_string, GstClockTime * t)
return TRUE;
}
+/* cue settings are part of the WebVTT specification. They are
+ * declared after the time interval in the first line of the
+ * cue. Example: 00:00:01,000 --> 00:00:02,000 D:vertical-lr A:start
+ * See also http://www.whatwg.org/specs/web-apps/current-work/webvtt.html
+ */
+static void
+parse_webvtt_cue_settings (ParserState * state, const gchar * settings)
+{
+ gchar **splitted_settings = g_strsplit_set (settings, " \t", -1);
+ gint i = 0;
+ gint16 text_position, text_size;
+ gint16 line_position;
+ gboolean vertical_found = FALSE;
+ gboolean alignment_found = FALSE;
+
+ while (i < g_strv_length (splitted_settings)) {
+ switch (splitted_settings[i][0]) {
+ case 'T':
+ sscanf (splitted_settings[i], "T:%" G_GINT16_FORMAT "%%",
+ &text_position);
+ state->text_position = (guint8) text_position;
+ break;
+ case 'D':
+ vertical_found = TRUE;
+ state->vertical = g_strdup (splitted_settings[i] + 2);
+ break;
+ case 'L':
+ if (g_str_has_suffix (splitted_settings[i], "%")) {
+ sscanf (splitted_settings[i], "L:%" G_GINT16_FORMAT "%%",
+ &line_position);
+ state->line_position = line_position;
+ } else {
+ sscanf (splitted_settings[i], "L:%" G_GINT16_FORMAT, &line_position);
+ state->line_number = line_position;
+ }
+ break;
+ case 'S':
+ sscanf (splitted_settings[i], "S:%" G_GINT16_FORMAT "%%", &text_size);
+ state->text_size = (guint8) text_size;
+ break;
+ case 'A':
+ state->alignment = g_strdup (splitted_settings[i] + 2);
+ alignment_found = TRUE;
+ break;
+ default:
+ break;
+ }
+ i++;
+ }
+ g_strfreev (splitted_settings);
+ if (!vertical_found)
+ state->vertical = g_strdup ("");
+ if (!alignment_found)
+ state->alignment = g_strdup ("");
+}
+
static gchar *
parse_subrip (ParserState * state, const gchar * line)
{
@@ -915,10 +1009,11 @@ parse_subrip (ParserState * state, const gchar * line)
ret = g_markup_escape_text (state->buf->str, state->buf->len);
g_string_truncate (state->buf, 0);
state->state = 0;
- subrip_unescape_formatting (ret);
+ subrip_unescape_formatting (ret, state->allowed_tags,
+ state->allows_tag_attributes);
subrip_remove_unhandled_tags (ret);
strip_trailing_newlines (ret);
- subrip_fix_up_markup (&ret);
+ subrip_fix_up_markup (&ret, state->allowed_tags);
return ret;
}
return NULL;
@@ -955,6 +1050,51 @@ parse_lrc (ParserState * state, const gchar * line)
return g_strdup (start + 1);
}
+/* WebVTT is a new subtitle format for the upcoming HTML5 video track
+ * element. This format is similar to Subrip, the biggest differences
+ * are that there can be cue settings detailing how to display the cue
+ * text and more markup tags are allowed.
+ * See also http://www.whatwg.org/specs/web-apps/current-work/webvtt.html
+ */
+static gchar *
+parse_webvtt (ParserState * state, const gchar * line)
+{
+ if (state->state == 1) {
+ GstClockTime ts_start, ts_end;
+ gchar *end_time;
+ gchar *cue_settings = NULL;
+
+ /* looking for start_time --> end_time */
+ if ((end_time = strstr (line, " --> ")) &&
+ parse_subrip_time (line, &ts_start) &&
+ parse_subrip_time (end_time + strlen (" --> "), &ts_end) &&
+ state->start_time <= ts_end) {
+ state->state = 2;
+ state->start_time = ts_start;
+ state->duration = ts_end - ts_start;
+ cue_settings = strstr (end_time + strlen (" --> "), " ");
+ } else {
+ GST_DEBUG ("error parsing subrip time line '%s'", line);
+ state->state = 0;
+ }
+
+ state->text_position = 0;
+ state->text_size = 0;
+ state->line_position = 0;
+ state->line_number = 0;
+
+ if (cue_settings)
+ parse_webvtt_cue_settings (state, cue_settings + 1);
+ else {
+ state->vertical = g_strdup ("");
+ state->alignment = g_strdup ("");
+ }
+
+ return NULL;
+ } else
+ return parse_subrip (state, line);
+}
+
static void
unescape_newlines_br (gchar * read)
{
@@ -1177,6 +1317,7 @@ parser_state_init (ParserState * state)
state->max_duration = 0; /* no limit */
state->state = 0;
state->segment = NULL;
+ state->allowed_tags = NULL;
}
static void
@@ -1198,6 +1339,7 @@ parser_state_dispose (GstSubParse * self, ParserState * state)
break;
}
}
+ state->allowed_tags = NULL;
}
/* regex type enum */
@@ -1207,6 +1349,7 @@ typedef enum
GST_SUB_PARSE_REGEX_MDVDSUB = 1,
GST_SUB_PARSE_REGEX_SUBRIP = 2,
GST_SUB_PARSE_REGEX_DKS = 3,
+ GST_SUB_PARSE_REGEX_VTT = 4,
} GstSubParseRegex;
static gpointer
@@ -1243,6 +1386,16 @@ gst_sub_parse_data_format_autodetect_regex_once (GstSubParseRegex regtype)
g_clear_error (&gerr);
}
break;
+ case GST_SUB_PARSE_REGEX_VTT:
+ result = (gpointer)
+ g_regex_new ("^(\\xef\\xbb\\xbf)?WEBVTT[\\xa\\xd\\x20\\x9]", 0, 0,
+ &gerr);
+ if (result == NULL) {
+ g_warning ("Compilation of vtt regex failed: %s", gerr->message);
+ g_error_free (gerr);
+ }
+ break;
+
default:
GST_WARNING ("Trying to allocate regex of unknown type %u", regtype);
}
@@ -1263,10 +1416,12 @@ gst_sub_parse_data_format_autodetect (gchar * match_str)
static GOnce mdvd_rx_once = G_ONCE_INIT;
static GOnce subrip_rx_once = G_ONCE_INIT;
static GOnce dks_rx_once = G_ONCE_INIT;
+ static GOnce vtt_rx_once = G_ONCE_INIT;
GRegex *mdvd_grx;
GRegex *subrip_grx;
GRegex *dks_grx;
+ GRegex *vtt_grx;
g_once (&mdvd_rx_once,
(GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once,
@@ -1277,10 +1432,14 @@ gst_sub_parse_data_format_autodetect (gchar * match_str)
g_once (&dks_rx_once,
(GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once,
(gpointer) GST_SUB_PARSE_REGEX_DKS);
+ g_once (&vtt_rx_once,
+ (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once,
+ (gpointer) GST_SUB_PARSE_REGEX_VTT);
mdvd_grx = (GRegex *) mdvd_rx_once.retval;
subrip_grx = (GRegex *) subrip_rx_once.retval;
dks_grx = (GRegex *) dks_rx_once.retval;
+ vtt_grx = (GRegex *) vtt_rx_once.retval;
if (g_regex_match (mdvd_grx, match_str, 0, NULL)) {
GST_LOG ("MicroDVD (frame based) format detected");
@@ -1294,6 +1453,10 @@ gst_sub_parse_data_format_autodetect (gchar * match_str)
GST_LOG ("DKS (time based) format detected");
return GST_SUB_PARSE_FORMAT_DKS;
}
+ if (g_regex_match (vtt_grx, match_str, 0, NULL) == TRUE) {
+ GST_LOG ("WebVTT (time based) format detected");
+ return GST_SUB_PARSE_FORMAT_VTT;
+ }
if (!strncmp (match_str, "FORMAT=TIME", 11)) {
GST_LOG ("MPSub (time based) format detected");
@@ -1383,6 +1546,8 @@ gst_sub_parse_format_autodetect (GstSubParse * self)
return gst_caps_new_simple ("text/x-raw",
"format", G_TYPE_STRING, "pango-markup", NULL);
case GST_SUB_PARSE_FORMAT_SUBRIP:
+ self->state.allowed_tags = (gpointer) allowed_srt_tags;
+ self->state.allows_tag_attributes = FALSE;
self->parse_line = parse_subrip;
return gst_caps_new_simple ("text/x-raw",
"format", G_TYPE_STRING, "pango-markup", NULL);
@@ -1408,6 +1573,12 @@ gst_sub_parse_format_autodetect (GstSubParse * self)
self->parse_line = parse_dks;
return gst_caps_new_simple ("text/x-raw",
"format", G_TYPE_STRING, "utf8", NULL);
+ case GST_SUB_PARSE_FORMAT_VTT:
+ self->state.allowed_tags = (gpointer) allowed_vtt_tags;
+ self->state.allows_tag_attributes = TRUE;
+ self->parse_line = parse_webvtt;
+ return gst_caps_new_simple ("text/x-raw",
+ "format", G_TYPE_STRING, "pango-markup", NULL);
case GST_SUB_PARSE_FORMAT_SUBVIEWER:
self->parse_line = parse_subviewer;
return gst_caps_new_simple ("text/x-raw",
@@ -1572,6 +1743,8 @@ handle_buffer (GstSubParse * self, GstBuffer * buf)
GST_TIME_FORMAT, subtitle, GST_TIME_ARGS (self->state.start_time),
GST_TIME_ARGS (self->state.duration));
+ g_free (self->state.vertical);
+ g_free (self->state.alignment);
ret = gst_pad_push (self->srcpad, buf);
/* move this forward (the tmplayer parser needs this) */
@@ -1738,6 +1911,9 @@ static GstStaticCaps smi_caps = GST_STATIC_CAPS ("application/x-subtitle-sami");
static GstStaticCaps dks_caps = GST_STATIC_CAPS ("application/x-subtitle-dks");
#define DKS_CAPS (gst_static_caps_get (&dks_caps))
+static GstStaticCaps vtt_caps = GST_STATIC_CAPS ("application/x-subtitle-vtt");
+#define VTT_CAPS (gst_static_caps_get (&vtt_caps))
+
static GstStaticCaps qttext_caps =
GST_STATIC_CAPS ("application/x-subtitle-qttext");
#define QTTEXT_CAPS (gst_static_caps_get (&qttext_caps))
@@ -1848,6 +2024,9 @@ gst_subparse_type_find (GstTypeFind * tf, gpointer private)
case GST_SUB_PARSE_FORMAT_LRC:
GST_DEBUG ("LRC format detected");
caps = LRC_CAPS;
+ case GST_SUB_PARSE_FORMAT_VTT:
+ GST_DEBUG ("WebVTT format detected");
+ caps = VTT_CAPS;
break;
default:
case GST_SUB_PARSE_FORMAT_UNKNOWN:
@@ -1865,8 +2044,8 @@ plugin_init (GstPlugin * plugin)
GST_DEBUG_CATEGORY_INIT (sub_parse_debug, "subparse", 0, ".sub parser");
if (!gst_type_find_register (plugin, "subparse_typefind", GST_RANK_MARGINAL,
- gst_subparse_type_find, "srt,sub,mpsub,mdvd,smi,txt,dks", SUB_CAPS,
- NULL, NULL))
+ gst_subparse_type_find, "srt,sub,mpsub,mdvd,smi,txt,dks,vtt",
+ SUB_CAPS, NULL, NULL))
return FALSE;
if (!gst_element_register (plugin, "subparse",
diff --git a/gst/subparse/gstsubparse.h b/gst/subparse/gstsubparse.h
index 012aee60d..64aab3463 100644
--- a/gst/subparse/gstsubparse.h
+++ b/gst/subparse/gstsubparse.h
@@ -56,7 +56,8 @@ typedef enum
GST_SUB_PARSE_FORMAT_SUBVIEWER = 7,
GST_SUB_PARSE_FORMAT_DKS = 8,
GST_SUB_PARSE_FORMAT_QTTEXT = 9,
- GST_SUB_PARSE_FORMAT_LRC = 10
+ GST_SUB_PARSE_FORMAT_LRC = 10,
+ GST_SUB_PARSE_FORMAT_VTT = 11
} GstSubParseFormat;
typedef struct {
@@ -69,6 +70,14 @@ typedef struct {
gpointer user_data;
gboolean have_internal_fps; /* If TRUE don't overwrite fps by property */
gint fps_n, fps_d; /* used by frame based parsers */
+ guint8 line_position; /* percent value */
+ gint line_number; /* line number, can be positive or negative */
+ guint8 text_position; /* percent value */
+ guint8 text_size; /* percent value */
+ gchar *vertical; /* "", "vertical", "vertical-lr" */
+ gchar *alignment; /* "", "start", "middle", "end" */
+ gconstpointer allowed_tags; /* list of markup tags allowed in the cue text. */
+ gboolean allows_tag_attributes;
} ParserState;
typedef gchar* (*Parser) (ParserState *state, const gchar *line);