diff options
Diffstat (limited to 'gst')
-rw-r--r-- | gst/subparse/gstsubparse.c | 317 | ||||
-rw-r--r-- | gst/subparse/gstsubparse.h | 11 |
2 files changed, 258 insertions, 70 deletions
diff --git a/gst/subparse/gstsubparse.c b/gst/subparse/gstsubparse.c index d59df57a9..c4c4b47b3 100644 --- a/gst/subparse/gstsubparse.c +++ b/gst/subparse/gstsubparse.c @@ -2,6 +2,8 @@ * Copyright (C) <1999> Erik Walthinsen <omega@cse.ogi.edu> * Copyright (C) 2004 Ronald S. Bultje <rbultje@ronald.bitfreak.net> * Copyright (C) 2006 Tim-Philipp Müller <tim centricular net> + * Copyright (C) 2016 Philippe Normand <pnormand@igalia.com> + * Copyright (C) 2016 Jan Schmidt <jan@centricular.com> * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public @@ -39,6 +41,10 @@ GST_DEBUG_CATEGORY (sub_parse_debug); #define DEFAULT_ENCODING NULL +#define ATTRIBUTE_REGEX "\\s?[a-zA-Z0-9\\. \t\\(\\)]*" +static const gchar *allowed_srt_tags[] = { "i", "b", "u", NULL }; +static const gchar *allowed_vtt_tags[] = + { "i", "b", "c", "u", "v", "ruby", "rt", NULL }; enum { @@ -61,7 +67,7 @@ static GstStaticPadTemplate sink_templ = GST_STATIC_PAD_TEMPLATE ("sink", GST_STATIC_CAPS ("application/x-subtitle; application/x-subtitle-sami; " "application/x-subtitle-tmplayer; application/x-subtitle-mpl2; " "application/x-subtitle-dks; application/x-subtitle-qttext;" - "application/x-subtitle-lrc;") + "application/x-subtitle-lrc; application/x-subtitle-vtt") ); static GstStaticPadTemplate src_templ = GST_STATIC_PAD_TEMPLATE ("src", @@ -370,6 +376,8 @@ gst_sub_parse_get_format_description (GstSubParseFormat format) return "SubViewer"; case GST_SUB_PARSE_FORMAT_DKS: return "DKS"; + case GST_SUB_PARSE_FORMAT_VTT: + return "WebVTT"; case GST_SUB_PARSE_FORMAT_QTTEXT: return "QTtext"; case GST_SUB_PARSE_FORMAT_LRC: @@ -663,45 +671,42 @@ strip_trailing_newlines (gchar * txt) * escaping everything (the text between these simple markers isn't * necessarily escaped, so it seems best to do it like this) */ static void -subrip_unescape_formatting (gchar * txt) +subrip_unescape_formatting (gchar * txt, gconstpointer allowed_tags_ptr, + gboolean allows_tag_attributes) { - gchar *pos; + gchar *res; + GRegex *tag_regex; + gchar *allowed_tags_pattern, *search_pattern; + const gchar *replace_pattern; - for (pos = txt; pos != NULL && *pos != '\0'; ++pos) { - if (g_ascii_strncasecmp (pos, "<u>", 9) == 0 || - g_ascii_strncasecmp (pos, "<i>", 9) == 0 || - g_ascii_strncasecmp (pos, "<b>", 9) == 0) { - pos[0] = '<'; - pos[1] = g_ascii_tolower (pos[4]); - pos[2] = '>'; - /* move NUL terminator as well */ - memmove (pos + 3, pos + 9, strlen (pos + 9) + 1); - pos += 2; - } + /* No processing needed if no escaped tag marker found in the string. */ + if (strstr (txt, "<") == NULL) + return; + + /* Build a list of alternates for our regexp. + * FIXME: Could be built once and stored */ + allowed_tags_pattern = g_strjoinv ("|", (gchar **) allowed_tags_ptr); + /* Look for starting/ending escaped tags with optional attributes. */ + search_pattern = g_strdup_printf ("<(/)?\\ *(%s)(%s)>", + allowed_tags_pattern, ATTRIBUTE_REGEX); + /* And unescape appropriately */ + if (allows_tag_attributes) { + replace_pattern = "<\\1\\2\\3>"; + } else { + replace_pattern = "<\\1\\2>"; } - for (pos = txt; pos != NULL && *pos != '\0'; ++pos) { - gchar *tag; + tag_regex = g_regex_new (search_pattern, 0, 0, NULL); + res = g_regex_replace (tag_regex, txt, strlen (txt), 0, + replace_pattern, 0, NULL); - /* look for start of an escaped closing tag */ - if (g_ascii_strncasecmp (pos, "</", 5) != 0) - continue; - tag = pos + 5; - while (*tag == ' ') - ++tag; - if ((*tag == 'u' || *tag == 'i' || *tag == 'b') && - g_ascii_strncasecmp (tag + 1, ">", 4) == 0) { - gsize tag_len = (guintptr) (tag + 1 + 4 - pos); - - pos[0] = '<'; - pos[1] = '/'; - pos[2] = g_ascii_tolower (*tag); - pos[3] = '>'; - /* move NUL terminator as well */ - memmove (pos + 4, pos + tag_len, strlen (pos + tag_len) + 1); - pos += 3; - } - } + /* res will always be shorter than the input or identical, so this + * copy is OK */ + strcpy (txt, res); + + g_free (res); + g_free (search_pattern); + g_free (allowed_tags_pattern); } @@ -740,16 +745,25 @@ subrip_remove_unhandled_tags (gchar * txt) } } -/* we only allow <i>, <u> and <b>, so let's take a simple approach. This code - * assumes the input has been escaped and subrip_unescape_formatting() has then - * been run over the input! This function adds missing closing markup tags and - * removes broken closing tags for tags that have never been opened. */ +/* we only allow a fixed set of tags like <i>, <u> and <b>, so let's + * take a simple approach. This code assumes the input has been + * escaped and subrip_unescape_formatting() has then been run over the + * input! This function adds missing closing markup tags and removes + * broken closing tags for tags that have never been opened. */ static void -subrip_fix_up_markup (gchar ** p_txt) +subrip_fix_up_markup (gchar ** p_txt, gconstpointer allowed_tags_ptr) { gchar *cur, *next_tag; - gchar open_tags[32]; + gchar *open_tags[32]; guint num_open_tags = 0; + const gchar *iter_tag; + guint offset = 0; + guint index; + gchar *cur_tag; + gchar *end_tag; + GRegex *tag_regex; + GMatchInfo *match_info; + gchar **allowed_tags = (gchar **) allowed_tags_ptr; g_assert (*p_txt != NULL); @@ -758,33 +772,56 @@ subrip_fix_up_markup (gchar ** p_txt) next_tag = strchr (cur, '<'); if (next_tag == NULL) break; - ++next_tag; - switch (*next_tag) { - case '/':{ - ++next_tag; - if (num_open_tags == 0 || open_tags[num_open_tags - 1] != *next_tag) { - GST_LOG ("broken input, closing tag '%c' is not open", *next_tag); - memmove (next_tag - 2, next_tag + 2, strlen (next_tag + 2) + 1); - next_tag -= 2; - } else { - /* it's all good, closing tag which is open */ - --num_open_tags; + offset = 0; + index = 0; + while (index < g_strv_length (allowed_tags)) { + iter_tag = allowed_tags[index]; + /* Look for a white listed tag */ + cur_tag = g_strconcat ("<", iter_tag, ATTRIBUTE_REGEX, ">", NULL); + tag_regex = g_regex_new (cur_tag, 0, 0, NULL); + g_regex_match (tag_regex, next_tag, 0, &match_info); + + if (g_match_info_matches (match_info)) { + gint start_pos, end_pos; + gchar *word = g_match_info_fetch (match_info, 0); + g_match_info_fetch_pos (match_info, 0, &start_pos, &end_pos); + if (start_pos == 0) { + offset = strlen (word); } - break; + g_free (word); } - case 'i': - case 'b': - case 'u': - if (num_open_tags == G_N_ELEMENTS (open_tags)) - return; /* something dodgy is going on, stop parsing */ - open_tags[num_open_tags] = *next_tag; + g_match_info_free (match_info); + g_regex_unref (tag_regex); + g_free (cur_tag); + index++; + if (offset) { + /* OK we found a tag, let's keep track of it */ + open_tags[num_open_tags] = g_strdup (iter_tag); ++num_open_tags; break; - default: - GST_ERROR ("unexpected tag '%c' (%s)", *next_tag, next_tag); - g_assert_not_reached (); - break; + } } + + if (offset) { + next_tag += offset; + cur = next_tag; + continue; + } + + if (*next_tag == '<' && *(next_tag + 1) == '/') { + end_tag = strchr (cur, '>'); + if (num_open_tags == 0 + || g_ascii_strncasecmp (end_tag - 1, open_tags[num_open_tags - 1], + strlen (open_tags[num_open_tags - 1]))) { + GST_LOG ("broken input, closing tag '%s' is not open", next_tag); + memmove (next_tag, end_tag + 1, strlen (end_tag) + 1); + next_tag -= strlen (end_tag); + } else { + --num_open_tags; + g_free (open_tags[num_open_tags]); + } + } + ++next_tag; cur = next_tag; } @@ -793,11 +830,12 @@ subrip_fix_up_markup (gchar ** p_txt) s = g_string_new (*p_txt); while (num_open_tags > 0) { - GST_LOG ("adding missing closing tag '%c'", open_tags[num_open_tags - 1]); + GST_LOG ("adding missing closing tag '%s'", open_tags[num_open_tags - 1]); g_string_append_c (s, '<'); g_string_append_c (s, '/'); - g_string_append_c (s, open_tags[num_open_tags - 1]); + g_string_append (s, open_tags[num_open_tags - 1]); g_string_append_c (s, '>'); + g_free (open_tags[num_open_tags - 1]); --num_open_tags; } g_free (*p_txt); @@ -857,6 +895,62 @@ parse_subrip_time (const gchar * ts_string, GstClockTime * t) return TRUE; } +/* cue settings are part of the WebVTT specification. They are + * declared after the time interval in the first line of the + * cue. Example: 00:00:01,000 --> 00:00:02,000 D:vertical-lr A:start + * See also http://www.whatwg.org/specs/web-apps/current-work/webvtt.html + */ +static void +parse_webvtt_cue_settings (ParserState * state, const gchar * settings) +{ + gchar **splitted_settings = g_strsplit_set (settings, " \t", -1); + gint i = 0; + gint16 text_position, text_size; + gint16 line_position; + gboolean vertical_found = FALSE; + gboolean alignment_found = FALSE; + + while (i < g_strv_length (splitted_settings)) { + switch (splitted_settings[i][0]) { + case 'T': + sscanf (splitted_settings[i], "T:%" G_GINT16_FORMAT "%%", + &text_position); + state->text_position = (guint8) text_position; + break; + case 'D': + vertical_found = TRUE; + state->vertical = g_strdup (splitted_settings[i] + 2); + break; + case 'L': + if (g_str_has_suffix (splitted_settings[i], "%")) { + sscanf (splitted_settings[i], "L:%" G_GINT16_FORMAT "%%", + &line_position); + state->line_position = line_position; + } else { + sscanf (splitted_settings[i], "L:%" G_GINT16_FORMAT, &line_position); + state->line_number = line_position; + } + break; + case 'S': + sscanf (splitted_settings[i], "S:%" G_GINT16_FORMAT "%%", &text_size); + state->text_size = (guint8) text_size; + break; + case 'A': + state->alignment = g_strdup (splitted_settings[i] + 2); + alignment_found = TRUE; + break; + default: + break; + } + i++; + } + g_strfreev (splitted_settings); + if (!vertical_found) + state->vertical = g_strdup (""); + if (!alignment_found) + state->alignment = g_strdup (""); +} + static gchar * parse_subrip (ParserState * state, const gchar * line) { @@ -915,10 +1009,11 @@ parse_subrip (ParserState * state, const gchar * line) ret = g_markup_escape_text (state->buf->str, state->buf->len); g_string_truncate (state->buf, 0); state->state = 0; - subrip_unescape_formatting (ret); + subrip_unescape_formatting (ret, state->allowed_tags, + state->allows_tag_attributes); subrip_remove_unhandled_tags (ret); strip_trailing_newlines (ret); - subrip_fix_up_markup (&ret); + subrip_fix_up_markup (&ret, state->allowed_tags); return ret; } return NULL; @@ -955,6 +1050,51 @@ parse_lrc (ParserState * state, const gchar * line) return g_strdup (start + 1); } +/* WebVTT is a new subtitle format for the upcoming HTML5 video track + * element. This format is similar to Subrip, the biggest differences + * are that there can be cue settings detailing how to display the cue + * text and more markup tags are allowed. + * See also http://www.whatwg.org/specs/web-apps/current-work/webvtt.html + */ +static gchar * +parse_webvtt (ParserState * state, const gchar * line) +{ + if (state->state == 1) { + GstClockTime ts_start, ts_end; + gchar *end_time; + gchar *cue_settings = NULL; + + /* looking for start_time --> end_time */ + if ((end_time = strstr (line, " --> ")) && + parse_subrip_time (line, &ts_start) && + parse_subrip_time (end_time + strlen (" --> "), &ts_end) && + state->start_time <= ts_end) { + state->state = 2; + state->start_time = ts_start; + state->duration = ts_end - ts_start; + cue_settings = strstr (end_time + strlen (" --> "), " "); + } else { + GST_DEBUG ("error parsing subrip time line '%s'", line); + state->state = 0; + } + + state->text_position = 0; + state->text_size = 0; + state->line_position = 0; + state->line_number = 0; + + if (cue_settings) + parse_webvtt_cue_settings (state, cue_settings + 1); + else { + state->vertical = g_strdup (""); + state->alignment = g_strdup (""); + } + + return NULL; + } else + return parse_subrip (state, line); +} + static void unescape_newlines_br (gchar * read) { @@ -1177,6 +1317,7 @@ parser_state_init (ParserState * state) state->max_duration = 0; /* no limit */ state->state = 0; state->segment = NULL; + state->allowed_tags = NULL; } static void @@ -1198,6 +1339,7 @@ parser_state_dispose (GstSubParse * self, ParserState * state) break; } } + state->allowed_tags = NULL; } /* regex type enum */ @@ -1207,6 +1349,7 @@ typedef enum GST_SUB_PARSE_REGEX_MDVDSUB = 1, GST_SUB_PARSE_REGEX_SUBRIP = 2, GST_SUB_PARSE_REGEX_DKS = 3, + GST_SUB_PARSE_REGEX_VTT = 4, } GstSubParseRegex; static gpointer @@ -1243,6 +1386,16 @@ gst_sub_parse_data_format_autodetect_regex_once (GstSubParseRegex regtype) g_clear_error (&gerr); } break; + case GST_SUB_PARSE_REGEX_VTT: + result = (gpointer) + g_regex_new ("^(\\xef\\xbb\\xbf)?WEBVTT[\\xa\\xd\\x20\\x9]", 0, 0, + &gerr); + if (result == NULL) { + g_warning ("Compilation of vtt regex failed: %s", gerr->message); + g_error_free (gerr); + } + break; + default: GST_WARNING ("Trying to allocate regex of unknown type %u", regtype); } @@ -1263,10 +1416,12 @@ gst_sub_parse_data_format_autodetect (gchar * match_str) static GOnce mdvd_rx_once = G_ONCE_INIT; static GOnce subrip_rx_once = G_ONCE_INIT; static GOnce dks_rx_once = G_ONCE_INIT; + static GOnce vtt_rx_once = G_ONCE_INIT; GRegex *mdvd_grx; GRegex *subrip_grx; GRegex *dks_grx; + GRegex *vtt_grx; g_once (&mdvd_rx_once, (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once, @@ -1277,10 +1432,14 @@ gst_sub_parse_data_format_autodetect (gchar * match_str) g_once (&dks_rx_once, (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once, (gpointer) GST_SUB_PARSE_REGEX_DKS); + g_once (&vtt_rx_once, + (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once, + (gpointer) GST_SUB_PARSE_REGEX_VTT); mdvd_grx = (GRegex *) mdvd_rx_once.retval; subrip_grx = (GRegex *) subrip_rx_once.retval; dks_grx = (GRegex *) dks_rx_once.retval; + vtt_grx = (GRegex *) vtt_rx_once.retval; if (g_regex_match (mdvd_grx, match_str, 0, NULL)) { GST_LOG ("MicroDVD (frame based) format detected"); @@ -1294,6 +1453,10 @@ gst_sub_parse_data_format_autodetect (gchar * match_str) GST_LOG ("DKS (time based) format detected"); return GST_SUB_PARSE_FORMAT_DKS; } + if (g_regex_match (vtt_grx, match_str, 0, NULL) == TRUE) { + GST_LOG ("WebVTT (time based) format detected"); + return GST_SUB_PARSE_FORMAT_VTT; + } if (!strncmp (match_str, "FORMAT=TIME", 11)) { GST_LOG ("MPSub (time based) format detected"); @@ -1383,6 +1546,8 @@ gst_sub_parse_format_autodetect (GstSubParse * self) return gst_caps_new_simple ("text/x-raw", "format", G_TYPE_STRING, "pango-markup", NULL); case GST_SUB_PARSE_FORMAT_SUBRIP: + self->state.allowed_tags = (gpointer) allowed_srt_tags; + self->state.allows_tag_attributes = FALSE; self->parse_line = parse_subrip; return gst_caps_new_simple ("text/x-raw", "format", G_TYPE_STRING, "pango-markup", NULL); @@ -1408,6 +1573,12 @@ gst_sub_parse_format_autodetect (GstSubParse * self) self->parse_line = parse_dks; return gst_caps_new_simple ("text/x-raw", "format", G_TYPE_STRING, "utf8", NULL); + case GST_SUB_PARSE_FORMAT_VTT: + self->state.allowed_tags = (gpointer) allowed_vtt_tags; + self->state.allows_tag_attributes = TRUE; + self->parse_line = parse_webvtt; + return gst_caps_new_simple ("text/x-raw", + "format", G_TYPE_STRING, "pango-markup", NULL); case GST_SUB_PARSE_FORMAT_SUBVIEWER: self->parse_line = parse_subviewer; return gst_caps_new_simple ("text/x-raw", @@ -1572,6 +1743,8 @@ handle_buffer (GstSubParse * self, GstBuffer * buf) GST_TIME_FORMAT, subtitle, GST_TIME_ARGS (self->state.start_time), GST_TIME_ARGS (self->state.duration)); + g_free (self->state.vertical); + g_free (self->state.alignment); ret = gst_pad_push (self->srcpad, buf); /* move this forward (the tmplayer parser needs this) */ @@ -1738,6 +1911,9 @@ static GstStaticCaps smi_caps = GST_STATIC_CAPS ("application/x-subtitle-sami"); static GstStaticCaps dks_caps = GST_STATIC_CAPS ("application/x-subtitle-dks"); #define DKS_CAPS (gst_static_caps_get (&dks_caps)) +static GstStaticCaps vtt_caps = GST_STATIC_CAPS ("application/x-subtitle-vtt"); +#define VTT_CAPS (gst_static_caps_get (&vtt_caps)) + static GstStaticCaps qttext_caps = GST_STATIC_CAPS ("application/x-subtitle-qttext"); #define QTTEXT_CAPS (gst_static_caps_get (&qttext_caps)) @@ -1848,6 +2024,9 @@ gst_subparse_type_find (GstTypeFind * tf, gpointer private) case GST_SUB_PARSE_FORMAT_LRC: GST_DEBUG ("LRC format detected"); caps = LRC_CAPS; + case GST_SUB_PARSE_FORMAT_VTT: + GST_DEBUG ("WebVTT format detected"); + caps = VTT_CAPS; break; default: case GST_SUB_PARSE_FORMAT_UNKNOWN: @@ -1865,8 +2044,8 @@ plugin_init (GstPlugin * plugin) GST_DEBUG_CATEGORY_INIT (sub_parse_debug, "subparse", 0, ".sub parser"); if (!gst_type_find_register (plugin, "subparse_typefind", GST_RANK_MARGINAL, - gst_subparse_type_find, "srt,sub,mpsub,mdvd,smi,txt,dks", SUB_CAPS, - NULL, NULL)) + gst_subparse_type_find, "srt,sub,mpsub,mdvd,smi,txt,dks,vtt", + SUB_CAPS, NULL, NULL)) return FALSE; if (!gst_element_register (plugin, "subparse", diff --git a/gst/subparse/gstsubparse.h b/gst/subparse/gstsubparse.h index 012aee60d..64aab3463 100644 --- a/gst/subparse/gstsubparse.h +++ b/gst/subparse/gstsubparse.h @@ -56,7 +56,8 @@ typedef enum GST_SUB_PARSE_FORMAT_SUBVIEWER = 7, GST_SUB_PARSE_FORMAT_DKS = 8, GST_SUB_PARSE_FORMAT_QTTEXT = 9, - GST_SUB_PARSE_FORMAT_LRC = 10 + GST_SUB_PARSE_FORMAT_LRC = 10, + GST_SUB_PARSE_FORMAT_VTT = 11 } GstSubParseFormat; typedef struct { @@ -69,6 +70,14 @@ typedef struct { gpointer user_data; gboolean have_internal_fps; /* If TRUE don't overwrite fps by property */ gint fps_n, fps_d; /* used by frame based parsers */ + guint8 line_position; /* percent value */ + gint line_number; /* line number, can be positive or negative */ + guint8 text_position; /* percent value */ + guint8 text_size; /* percent value */ + gchar *vertical; /* "", "vertical", "vertical-lr" */ + gchar *alignment; /* "", "start", "middle", "end" */ + gconstpointer allowed_tags; /* list of markup tags allowed in the cue text. */ + gboolean allows_tag_attributes; } ParserState; typedef gchar* (*Parser) (ParserState *state, const gchar *line); |