2 files changed, 258 insertions, 70 deletions
diff --git a/gst/subparse/gstsubparse.c b/gst/subparse/gstsubparse.c
index d59df57a9..c4c4b47b3 100644
--- a/gst/subparse/gstsubparse.c
+++ b/gst/subparse/gstsubparse.c
@@ -2,6 +2,8 @@
  * Copyright (C) <1999> Erik Walthinsen <omega@cse.ogi.edu>
  * Copyright (C) 2004 Ronald S. Bultje <rbultje@ronald.bitfreak.net>
  * Copyright (C) 2006 Tim-Philipp Müller <tim centricular net>
+ * Copyright (C) 2016 Philippe Normand <pnormand@igalia.com>
+ * Copyright (C) 2016 Jan Schmidt <jan@centricular.com>
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Library General Public
@@ -39,6 +41,10 @@
 GST_DEBUG_CATEGORY (sub_parse_debug);
 
 #define DEFAULT_ENCODING   NULL
+#define ATTRIBUTE_REGEX "\\s?[a-zA-Z0-9\\. \t\\(\\)]*"  /* optional tag attributes */
+static const gchar *allowed_srt_tags[] = { "i", "b", "u", NULL };  /* SubRip tag whitelist */
+static const gchar *allowed_vtt_tags[] =  /* WebVTT tag whitelist */
+    { "i", "b", "c", "u", "v", "ruby", "rt", NULL };
 
 enum
 {
@@ -61,7 +67,7 @@ static GstStaticPadTemplate sink_templ = GST_STATIC_PAD_TEMPLATE ("sink",
     GST_STATIC_CAPS ("application/x-subtitle; application/x-subtitle-sami; "
         "application/x-subtitle-tmplayer; application/x-subtitle-mpl2; "
         "application/x-subtitle-dks; application/x-subtitle-qttext;"
-        "application/x-subtitle-lrc;")
+        "application/x-subtitle-lrc; application/x-subtitle-vtt")
     );
 
 static GstStaticPadTemplate src_templ = GST_STATIC_PAD_TEMPLATE ("src",
@@ -370,6 +376,8 @@ gst_sub_parse_get_format_description (GstSubParseFormat format)
       return "SubViewer";
     case GST_SUB_PARSE_FORMAT_DKS:
       return "DKS";
+    case GST_SUB_PARSE_FORMAT_VTT:
+      return "WebVTT";
     case GST_SUB_PARSE_FORMAT_QTTEXT:
       return "QTtext";
     case GST_SUB_PARSE_FORMAT_LRC:
@@ -663,45 +671,52 @@ strip_trailing_newlines (gchar * txt)
  * escaping everything (the text between these simple markers isn't
  * necessarily escaped, so it seems best to do it like this) */
 static void
-subrip_unescape_formatting (gchar * txt)
+subrip_unescape_formatting (gchar * txt, gconstpointer allowed_tags_ptr,
+    gboolean allows_tag_attributes)
 {
-  gchar *pos;
+  gchar *res;
+  GRegex *tag_regex;
+  gchar *allowed_tags_pattern, *search_pattern;
+  const gchar *replace_pattern;
 
-  for (pos = txt; pos != NULL && *pos != '\0'; ++pos) {
-    if (g_ascii_strncasecmp (pos, "&lt;u&gt;", 9) == 0 ||
-        g_ascii_strncasecmp (pos, "&lt;i&gt;", 9) == 0 ||
-        g_ascii_strncasecmp (pos, "&lt;b&gt;", 9) == 0) {
-      pos[0] = '<';
-      pos[1] = g_ascii_tolower (pos[4]);
-      pos[2] = '>';
-      /* move NUL terminator as well */
-      memmove (pos + 3, pos + 9, strlen (pos + 9) + 1);
-      pos += 2;
-    }
+  /* allowed_tags_ptr is a NULL-terminated array of the tag names to turn
+   * back into markup; allows_tag_attributes says whether whatever follows
+   * the tag name inside the brackets is kept or dropped. txt is rewritten
+   * in place. */
+
+  /* No processing needed if no escaped tag marker found in the string. */
+  if (strstr (txt, "&lt;") == NULL)
+    return;
+
+  /* Build a list of alternates for our regexp.
+   * FIXME: Could be built once and stored */
+  allowed_tags_pattern = g_strjoinv ("|", (gchar **) allowed_tags_ptr);
+  /* Look for starting/ending escaped tags with optional attributes. */
+  search_pattern = g_strdup_printf ("&lt;(/)?\\ *(%s)(%s)&gt;",
+      allowed_tags_pattern, ATTRIBUTE_REGEX);
+  /* And unescape appropriately */
+  if (allows_tag_attributes) {
+    replace_pattern = "<\\1\\2\\3>";
+  } else {
+    replace_pattern = "<\\1\\2>";
   }
 
-  for (pos = txt; pos != NULL && *pos != '\0'; ++pos) {
-    gchar *tag;
+  tag_regex = g_regex_new (search_pattern, 0, 0, NULL);
+  if (tag_regex != NULL) {
+    res = g_regex_replace (tag_regex, txt, strlen (txt), 0,
+        replace_pattern, 0, NULL);
+    if (res != NULL) {
+      /* every replacement is shorter than what it replaces, so res is
+       * never longer than txt and copying it back in place is OK */
+      strcpy (txt, res);
+      g_free (res);
+    }
+    /* drop our reference, otherwise the compiled regex leaks per call */
+    g_regex_unref (tag_regex);
+  }
 
-    /* look for start of an escaped closing tag */
-    if (g_ascii_strncasecmp (pos, "&lt;/", 5) != 0)
-      continue;
-    tag = pos + 5;
-    while (*tag == ' ')
-      ++tag;
-    if ((*tag == 'u' || *tag == 'i' || *tag == 'b') &&
-        g_ascii_strncasecmp (tag + 1, "&gt;", 4) == 0) {
-      gsize tag_len = (guintptr) (tag + 1 + 4 - pos);
-
-      pos[0] = '<';
-      pos[1] = '/';
-      pos[2] = g_ascii_tolower (*tag);
-      pos[3] = '>';
-      /* move NUL terminator as well */
-      memmove (pos + 4, pos + tag_len, strlen (pos + tag_len) + 1);
-      pos += 3;
-    }
-  }
+  g_free (search_pattern);
+  g_free (allowed_tags_pattern);
 }
 
 
@@ -740,16 +745,25 @@ subrip_remove_unhandled_tags (gchar * txt)
   }
 }
 
-/* we only allow <i>, <u> and <b>, so let's take a simple approach. This code
- * assumes the input has been escaped and subrip_unescape_formatting() has then
- * been run over the input! This function adds missing closing markup tags and
- * removes broken closing tags for tags that have never been opened. */
+/* we only allow a fixed set of tags like <i>, <u> and <b>, so let's
+ * take a simple approach. This code assumes the input has been
+ * escaped and subrip_unescape_formatting() has then been run over the
+ * input! This function adds missing closing markup tags and removes
+ * broken closing tags for tags that have never been opened. */
 static void
-subrip_fix_up_markup (gchar ** p_txt)
+subrip_fix_up_markup (gchar ** p_txt, gconstpointer allowed_tags_ptr)
 {
   gchar *cur, *next_tag;
-  gchar open_tags[32];
+  gchar *open_tags[32];
   guint num_open_tags = 0;
+  const gchar *iter_tag;
+  guint offset = 0;
+  guint index;
+  gchar *cur_tag;
+  gchar *end_tag;
+  GRegex *tag_regex;
+  GMatchInfo *match_info;
+  gchar **allowed_tags = (gchar **) allowed_tags_ptr;
 
   g_assert (*p_txt != NULL);
 
@@ -758,33 +772,78 @@ subrip_fix_up_markup (gchar ** p_txt)
     next_tag = strchr (cur, '<');
     if (next_tag == NULL)
       break;
-    ++next_tag;
-    switch (*next_tag) {
-      case '/':{
-        ++next_tag;
-        if (num_open_tags == 0 || open_tags[num_open_tags - 1] != *next_tag) {
-          GST_LOG ("broken input, closing tag '%c' is not open", *next_tag);
-          memmove (next_tag - 2, next_tag + 2, strlen (next_tag + 2) + 1);
-          next_tag -= 2;
-        } else {
-          /* it's all good, closing tag which is open */
-          --num_open_tags;
+    offset = 0;
+    index = 0;
+    while (allowed_tags[index] != NULL) {
+      iter_tag = allowed_tags[index];
+      /* Look for a white listed tag, only accepting a match that starts
+       * right at the '<' we are standing on */
+      cur_tag = g_strconcat ("<", iter_tag, ATTRIBUTE_REGEX, ">", NULL);
+      tag_regex = g_regex_new (cur_tag, 0, 0, NULL);
+      g_regex_match (tag_regex, next_tag, G_REGEX_MATCH_ANCHORED,
+          &match_info);
+
+      if (g_match_info_matches (match_info)) {
+        gint start_pos, end_pos;
+        gchar *word = g_match_info_fetch (match_info, 0);
+        g_match_info_fetch_pos (match_info, 0, &start_pos, &end_pos);
+        if (start_pos == 0) {
+          offset = strlen (word);
         }
-        break;
+        g_free (word);
       }
-      case 'i':
-      case 'b':
-      case 'u':
-        if (num_open_tags == G_N_ELEMENTS (open_tags))
-          return;               /* something dodgy is going on, stop parsing */
-        open_tags[num_open_tags] = *next_tag;
+      g_match_info_free (match_info);
+      g_regex_unref (tag_regex);
+      g_free (cur_tag);
+      index++;
+      if (offset) {
+        if (num_open_tags == G_N_ELEMENTS (open_tags)) {
+          /* something dodgy is going on, stop parsing. The recorded tag
+           * names are heap copies, so release them before bailing out */
+          while (num_open_tags > 0)
+            g_free (open_tags[--num_open_tags]);
+          return;
+        }
+        /* OK we found a tag, let's keep track of it */
+        open_tags[num_open_tags] = g_strdup (iter_tag);
         ++num_open_tags;
         break;
-      default:
-        GST_ERROR ("unexpected tag '%c' (%s)", *next_tag, next_tag);
-        g_assert_not_reached ();
-        break;
+      }
     }
+
+    if (offset) {
+      next_tag += offset;
+      cur = next_tag;
+      continue;
+    }
+
+    if (*next_tag == '<' && *(next_tag + 1) == '/') {
+      end_tag = strchr (next_tag, '>');
+      if (end_tag != NULL) {
+        const gchar *last = NULL;
+        gsize last_len = 0;
+
+        if (num_open_tags > 0) {
+          last = open_tags[num_open_tags - 1];
+          last_len = strlen (last);
+        }
+        /* the name of a closing tag starts right after "</"; compare it
+         * in full so multi-character tags such as "ruby" match too */
+        if (last == NULL || (gsize) (end_tag - next_tag) < last_len + 2
+            || g_ascii_strncasecmp (next_tag + 2, last, last_len)) {
+          GST_LOG ("broken input, closing tag '%s' is not open", next_tag);
+          /* cut the tag out: move everything after it, including the
+           * terminating NUL, and rescan from the same position */
+          memmove (next_tag, end_tag + 1, strlen (end_tag + 1) + 1);
+          cur = next_tag;
+          continue;
+        } else {
+          --num_open_tags;
+          g_free (open_tags[num_open_tags]);
+        }
+      }
+    }
+    ++next_tag;
     cur = next_tag;
   }
 
@@ -793,11 +830,12 @@ subrip_fix_up_markup (gchar ** p_txt)
 
     s = g_string_new (*p_txt);
     while (num_open_tags > 0) {
-      GST_LOG ("adding missing closing tag '%c'", open_tags[num_open_tags - 1]);
+      GST_LOG ("adding missing closing tag '%s'", open_tags[num_open_tags - 1]);
       g_string_append_c (s, '<');
       g_string_append_c (s, '/');
-      g_string_append_c (s, open_tags[num_open_tags - 1]);
+      g_string_append (s, open_tags[num_open_tags - 1]);
       g_string_append_c (s, '>');
+      g_free (open_tags[num_open_tags - 1]);
       --num_open_tags;
     }
     g_free (*p_txt);
@@ -857,6 +895,74 @@ parse_subrip_time (const gchar * ts_string, GstClockTime * t)
   return TRUE;
 }
 
+/* cue settings are part of the WebVTT specification. They are
+ * declared after the time interval in the first line of the
+ * cue. Example: 00:00:01,000 --> 00:00:02,000 D:vertical-lr A:start
+ * See also http://www.whatwg.org/specs/web-apps/current-work/webvtt.html
+ *
+ * @settings is split on spaces and tabs; every entry is expected to look
+ * like "<letter>:<value>". T, S and L update the numeric fields of @state
+ * (percent values are clamped to 0..100), D and A set state->vertical and
+ * state->alignment. Entries that do not parse are skipped. On return
+ * state->vertical and state->alignment always hold newly allocated
+ * strings ("" when the setting is absent); previous values are overwritten
+ * without being freed.
+ */
+static void
+parse_webvtt_cue_settings (ParserState * state, const gchar * settings)
+{
+  gchar **splitted_settings = g_strsplit_set (settings, " \t", -1);
+  gchar **iter;
+  gchar *vertical = NULL;
+  gchar *alignment = NULL;
+
+  for (iter = splitted_settings; *iter != NULL; iter++) {
+    const gchar *setting = *iter;
+    gint16 value = 0;
+
+    /* requiring the ':' also keeps the "+ 2" value lookups below from
+     * stepping past the end of a one character token */
+    if (setting[0] == '\0' || setting[1] != ':')
+      continue;
+
+    switch (setting[0]) {
+      case 'T':
+        /* only store the value if a number was actually converted */
+        if (sscanf (setting, "T:%" G_GINT16_FORMAT "%%", &value) == 1)
+          state->text_position = (guint8) CLAMP (value, 0, 100);
+        break;
+      case 'D':
+        /* last occurrence wins; drop the copy made for an earlier one */
+        g_free (vertical);
+        vertical = g_strdup (setting + 2);
+        break;
+      case 'L':
+        if (sscanf (setting, "L:%" G_GINT16_FORMAT, &value) != 1)
+          break;
+        /* a trailing '%' makes it a position, otherwise a line number */
+        if (g_str_has_suffix (setting, "%"))
+          state->line_position = (guint8) CLAMP (value, 0, 100);
+        else
+          state->line_number = value;
+        break;
+      case 'S':
+        if (sscanf (setting, "S:%" G_GINT16_FORMAT "%%", &value) == 1)
+          state->text_size = (guint8) CLAMP (value, 0, 100);
+        break;
+      case 'A':
+        g_free (alignment);
+        alignment = g_strdup (setting + 2);
+        break;
+      default:
+        break;
+    }
+  }
+  g_strfreev (splitted_settings);
+
+  state->vertical = (vertical != NULL) ? vertical : g_strdup ("");
+  state->alignment = (alignment != NULL) ? alignment : g_strdup ("");
+}
+
 static gchar *
 parse_subrip (ParserState * state, const gchar * line)
 {
@@ -915,10 +1009,11 @@ parse_subrip (ParserState * state, const gchar * line)
         ret = g_markup_escape_text (state->buf->str, state->buf->len);
         g_string_truncate (state->buf, 0);
         state->state = 0;
-        subrip_unescape_formatting (ret);
+        subrip_unescape_formatting (ret, state->allowed_tags,
+            state->allows_tag_attributes);
         subrip_remove_unhandled_tags (ret);
         strip_trailing_newlines (ret);
-        subrip_fix_up_markup (&ret);
+        subrip_fix_up_markup (&ret, state->allowed_tags);
         return ret;
       }
       return NULL;
@@ -955,6 +1050,61 @@ parse_lrc (ParserState * state, const gchar * line)
   return g_strdup (start + 1);
 }
 
+/* WebVTT is a new subtitle format for the upcoming HTML5 video track
+ * element. This format is similar to Subrip, the biggest differences
+ * are that there can be cue settings detailing how to display the cue
+ * text and more markup tags are allowed.
+ * See also http://www.whatwg.org/specs/web-apps/current-work/webvtt.html
+ *
+ * Only the timing line (parser state 1) is handled here, every other
+ * state is delegated to parse_subrip(), whose return value is passed on.
+ * The timing line itself never yields text, so NULL is returned for it.
+ */
+static gchar *
+parse_webvtt (ParserState * state, const gchar * line)
+{
+  GstClockTime ts_start, ts_end;
+  const gchar *end_time;
+  const gchar *cue_settings;
+
+  if (state->state != 1)
+    return parse_subrip (state, line);
+
+  state->text_position = 0;
+  state->text_size = 0;
+  state->line_position = 0;
+  state->line_number = 0;
+
+  /* looking for start_time --> end_time */
+  end_time = strstr (line, " --> ");
+  if (end_time == NULL ||
+      !parse_subrip_time (line, &ts_start) ||
+      !parse_subrip_time (end_time + strlen (" --> "), &ts_end) ||
+      state->start_time > ts_end) {
+    GST_DEBUG ("error parsing webvtt time line '%s'", line);
+    state->state = 0;
+    /* bail out before allocating vertical/alignment: strings created
+     * here would just be overwritten, unfreed, by the next timing line */
+    return NULL;
+  }
+
+  state->state = 2;
+  state->start_time = ts_start;
+  state->duration = ts_end - ts_start;
+
+  /* whatever follows the end time, separated by a space, is the
+   * list of cue settings */
+  cue_settings = strstr (end_time + strlen (" --> "), " ");
+  if (cue_settings != NULL) {
+    parse_webvtt_cue_settings (state, cue_settings + 1);
+  } else {
+    state->vertical = g_strdup ("");
+    state->alignment = g_strdup ("");
+  }
+
+  return NULL;
+}
+
 static void
 unescape_newlines_br (gchar * read)
 {
@@ -1177,6 +1317,7 @@ parser_state_init (ParserState * state)
   state->max_duration = 0;      /* no limit */
   state->state = 0;
   state->segment = NULL;
+  state->allowed_tags = NULL;
 }
 
 static void
@@ -1198,6 +1339,7 @@ parser_state_dispose (GstSubParse * self, ParserState * state)
         break;
     }
   }
+  state->allowed_tags = NULL;
 }
 
 /* regex type enum */
@@ -1207,6 +1349,7 @@ typedef enum
   GST_SUB_PARSE_REGEX_MDVDSUB = 1,
   GST_SUB_PARSE_REGEX_SUBRIP = 2,
   GST_SUB_PARSE_REGEX_DKS = 3,
+  GST_SUB_PARSE_REGEX_VTT = 4,
 } GstSubParseRegex;
 
 static gpointer
@@ -1243,6 +1386,18 @@ gst_sub_parse_data_format_autodetect_regex_once (GstSubParseRegex regtype)
         g_clear_error (&gerr);
       }
       break;
+    case GST_SUB_PARSE_REGEX_VTT:
+      /* "WEBVTT" magic followed by LF, CR, space or tab; the leading group
+       * is presumably meant as an optional UTF-8 BOM - TODO confirm */
+      result = (gpointer)
+          g_regex_new ("^(\\xef\\xbb\\xbf)?WEBVTT[\\xa\\xd\\x20\\x9]", 0, 0,
+          &gerr);
+      if (result == NULL) {
+        g_warning ("Compilation of vtt regex failed: %s", gerr->message);
+        g_clear_error (&gerr);
+      }
+      break;
+
     default:
       GST_WARNING ("Trying to allocate regex of unknown type %u", regtype);
   }
@@ -1263,10 +1416,12 @@ gst_sub_parse_data_format_autodetect (gchar * match_str)
   static GOnce mdvd_rx_once = G_ONCE_INIT;
   static GOnce subrip_rx_once = G_ONCE_INIT;
   static GOnce dks_rx_once = G_ONCE_INIT;
+  static GOnce vtt_rx_once = G_ONCE_INIT;
 
   GRegex *mdvd_grx;
   GRegex *subrip_grx;
   GRegex *dks_grx;
+  GRegex *vtt_grx;
 
   g_once (&mdvd_rx_once,
       (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once,
@@ -1277,10 +1432,14 @@ gst_sub_parse_data_format_autodetect (gchar * match_str)
   g_once (&dks_rx_once,
       (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once,
       (gpointer) GST_SUB_PARSE_REGEX_DKS);
+  g_once (&vtt_rx_once,
+      (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once,
+      (gpointer) GST_SUB_PARSE_REGEX_VTT);
 
   mdvd_grx = (GRegex *) mdvd_rx_once.retval;
   subrip_grx = (GRegex *) subrip_rx_once.retval;
   dks_grx = (GRegex *) dks_rx_once.retval;
+  vtt_grx = (GRegex *) vtt_rx_once.retval;
 
   if (g_regex_match (mdvd_grx, match_str, 0, NULL)) {
     GST_LOG ("MicroDVD (frame based) format detected");
@@ -1294,6 +1453,10 @@ gst_sub_parse_data_format_autodetect (gchar * match_str)
     GST_LOG ("DKS (time based) format detected");
     return GST_SUB_PARSE_FORMAT_DKS;
   }
+  if (g_regex_match (vtt_grx, match_str, 0, NULL) == TRUE) {
+    GST_LOG ("WebVTT (time based) format detected");
+    return GST_SUB_PARSE_FORMAT_VTT;
+  }
 
   if (!strncmp (match_str, "FORMAT=TIME", 11)) {
     GST_LOG ("MPSub (time based) format detected");
@@ -1383,6 +1546,8 @@ gst_sub_parse_format_autodetect (GstSubParse * self)
       return gst_caps_new_simple ("text/x-raw",
           "format", G_TYPE_STRING, "pango-markup", NULL);
     case GST_SUB_PARSE_FORMAT_SUBRIP:
+      self->state.allowed_tags = (gpointer) allowed_srt_tags;
+      self->state.allows_tag_attributes = FALSE;
       self->parse_line = parse_subrip;
       return gst_caps_new_simple ("text/x-raw",
           "format", G_TYPE_STRING, "pango-markup", NULL);
@@ -1408,6 +1573,12 @@ gst_sub_parse_format_autodetect (GstSubParse * self)
       self->parse_line = parse_dks;
       return gst_caps_new_simple ("text/x-raw",
           "format", G_TYPE_STRING, "utf8", NULL);
+    case GST_SUB_PARSE_FORMAT_VTT:
+      self->state.allowed_tags = (gpointer) allowed_vtt_tags;
+      self->state.allows_tag_attributes = TRUE;
+      self->parse_line = parse_webvtt;
+      return gst_caps_new_simple ("text/x-raw",
+          "format", G_TYPE_STRING, "pango-markup", NULL);
     case GST_SUB_PARSE_FORMAT_SUBVIEWER:
       self->parse_line = parse_subviewer;
       return gst_caps_new_simple ("text/x-raw",
@@ -1572,6 +1743,12 @@ handle_buffer (GstSubParse * self, GstBuffer * buf)
           GST_TIME_FORMAT, subtitle, GST_TIME_ARGS (self->state.start_time),
           GST_TIME_ARGS (self->state.duration));
 
+      /* release the per-cue strings and reset the pointers so a later
+       * free of the same fields cannot hit already released memory */
+      g_free (self->state.vertical);
+      self->state.vertical = NULL;
+      g_free (self->state.alignment);
+      self->state.alignment = NULL;
       ret = gst_pad_push (self->srcpad, buf);
 
       /* move this forward (the tmplayer parser needs this) */
@@ -1738,6 +1911,9 @@ static GstStaticCaps smi_caps = GST_STATIC_CAPS ("application/x-subtitle-sami");
 static GstStaticCaps dks_caps = GST_STATIC_CAPS ("application/x-subtitle-dks");
 #define DKS_CAPS (gst_static_caps_get (&dks_caps))
 
+static GstStaticCaps vtt_caps = GST_STATIC_CAPS ("application/x-subtitle-vtt");
+#define VTT_CAPS (gst_static_caps_get (&vtt_caps))
+
 static GstStaticCaps qttext_caps =
 GST_STATIC_CAPS ("application/x-subtitle-qttext");
 #define QTTEXT_CAPS (gst_static_caps_get (&qttext_caps))
@@ -1848,6 +2024,10 @@ gst_subparse_type_find (GstTypeFind * tf, gpointer private)
     case GST_SUB_PARSE_FORMAT_LRC:
       GST_DEBUG ("LRC format detected");
       caps = LRC_CAPS;
+      break;
+    case GST_SUB_PARSE_FORMAT_VTT:
+      GST_DEBUG ("WebVTT format detected");
+      caps = VTT_CAPS;
       break;
     default:
     case GST_SUB_PARSE_FORMAT_UNKNOWN:
@@ -1865,8 +2044,8 @@ plugin_init (GstPlugin * plugin)
   GST_DEBUG_CATEGORY_INIT (sub_parse_debug, "subparse", 0, ".sub parser");
 
   if (!gst_type_find_register (plugin, "subparse_typefind", GST_RANK_MARGINAL,
-          gst_subparse_type_find, "srt,sub,mpsub,mdvd,smi,txt,dks", SUB_CAPS,
-          NULL, NULL))
+          gst_subparse_type_find, "srt,sub,mpsub,mdvd,smi,txt,dks,vtt",
+          SUB_CAPS, NULL, NULL))
     return FALSE;
 
   if (!gst_element_register (plugin, "subparse",
diff --git a/gst/subparse/gstsubparse.h b/gst/subparse/gstsubparse.h
index 012aee60d..64aab3463 100644
--- a/gst/subparse/gstsubparse.h
+++ b/gst/subparse/gstsubparse.h
@@ -56,7 +56,8 @@ typedef enum
   GST_SUB_PARSE_FORMAT_SUBVIEWER = 7,
   GST_SUB_PARSE_FORMAT_DKS = 8,
   GST_SUB_PARSE_FORMAT_QTTEXT = 9,
-  GST_SUB_PARSE_FORMAT_LRC = 10
+  GST_SUB_PARSE_FORMAT_LRC = 10,
+  GST_SUB_PARSE_FORMAT_VTT = 11
 } GstSubParseFormat;
 
 typedef struct {
@@ -69,6 +70,14 @@ typedef struct {
   gpointer user_data;
   gboolean have_internal_fps; /* If TRUE don't overwrite fps by property */
   gint fps_n, fps_d;     /* used by frame based parsers */
+  guint8 line_position;          /* percent value */
+  gint line_number;              /* line number, can be positive or negative */
+  guint8 text_position;          /* percent value */
+  guint8 text_size;          /* percent value */
+  gchar *vertical;        /* "", "vertical", "vertical-lr" */
+  gchar *alignment;       /* "", "start", "middle", "end" */
+  gconstpointer allowed_tags; /* list of markup tags allowed in the cue text. */
+  gboolean allows_tag_attributes;
 } ParserState;
 
 typedef gchar* (*Parser) (ParserState *state, const gchar *line);