summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAkira TAGOH <akira@tagoh.org>2015-12-03 18:47:49 +0900
committerAkira TAGOH <akira@tagoh.org>2015-12-03 18:47:49 +0900
commitdcf359902feb34e0f8f5f3fe5e3dae719af5937e (patch)
tree19ef6a8077f8771b9c076d2ba98f563417d0e864
parent0207866d389f8da5739823e311fac16c9f52c33c (diff)
Support the language alias for the canonicalization
-rw-r--r--data/Makefile.am1
-rw-r--r--liblangtag/lt-tag.c215
-rw-r--r--liblangtag/lt-tag.h22
-rw-r--r--liblangtag/lt-xml.c5
-rw-r--r--liblangtag/lt-xml.h3
-rw-r--r--tests/check-tag.c15
6 files changed, 241 insertions, 20 deletions
diff --git a/data/Makefile.am b/data/Makefile.am
index 9ec3d34..11e20f3 100644
--- a/data/Makefile.am
+++ b/data/Makefile.am
@@ -45,6 +45,7 @@ bcp47_xml_files = \
supplemental_xml_files = \
common/supplemental/likelySubtags.xml \
common/supplemental/supplementalData.xml \
+ common/supplemental/supplementalMetadata.xml \
$(NULL)
stamp_files = \
stamp-core-zip \
diff --git a/liblangtag/lt-tag.c b/liblangtag/lt-tag.c
index 56b8a46..729f50d 100644
--- a/liblangtag/lt-tag.c
+++ b/liblangtag/lt-tag.c
@@ -53,6 +53,7 @@ struct _lt_tag_t {
lt_mem_t parent;
int32_t wildcard_map;
lt_tag_state_t state;
+ int tag_string_filter;
lt_string_t *tag_string;
lt_lang_t *language;
lt_extlang_t *extlang;
@@ -1241,6 +1242,147 @@ _lt_tag_convert_from_locale_string(const char *locale,
return tag;
}
+static lt_tag_t *
+_lt_tag_canonicalize_alias(lt_tag_t *tag,
+ lt_error_t **error)
+{
+ lt_xml_t *xml;
+ lt_string_t *s = NULL;
+ lt_tag_t *retval = NULL;
+ lt_script_t *script;
+ lt_region_t *region;
+ lt_list_t *l, *ll;
+ lt_extension_t *extension;
+ lt_error_t *err = NULL;
+ xmlDocPtr doc;
+ xmlXPathContextPtr xctxt = NULL;
+ xmlXPathObjectPtr xobj = NULL;
+ xmlNodePtr ent;
+ char *q;
+ const char *tag_string;
+ int n, i, retry, filter;
+ xmlChar *rep;
+ size_t len;
+
+ xml = lt_xml_new();
+ doc = lt_xml_get_cldr(xml, LT_XML_CLDR_SUPPLEMENTAL_SUPPLEMENTAL_METADATA);
+ xctxt = xmlXPathNewContext(doc);
+ if (!xctxt) {
+ lt_error_set(&err, LT_ERR_OOM,
+ "Unable to create an instance of xmlXPathContextPtr.");
+ goto bail;
+ }
+ for (retry = 4, n = 0; retry > 0; retry--) {
+ switch (retry) {
+ case 1:
+ filter = LT_TAG_FILTER_LANGUAGE|LT_TAG_FILTER_GRANDFATHERED;
+ break;
+ case 2:
+ filter = LT_TAG_FILTER_LANGUAGE|LT_TAG_FILTER_SCRIPT|LT_TAG_FILTER_GRANDFATHERED;
+ break;
+ case 3:
+ filter = LT_TAG_FILTER_LANGUAGE|LT_TAG_FILTER_REGION|LT_TAG_FILTER_GRANDFATHERED;
+ break;
+ default:
+ filter = LT_TAG_FILTER_LANGUAGE|LT_TAG_FILTER_SCRIPT|LT_TAG_FILTER_REGION|LT_TAG_FILTER_GRANDFATHERED;
+ break;
+ }
+ tag_string = lt_tag_get_string_with_filter(tag, filter);
+ lt_debug(LT_MSGCAT_TAG, "alias lookup: %s", tag_string);
+ /* explicitly exclude 'macrolanguage' alias so that we deal with it in the canonicalization for extlang form */
+ q = lt_strdup_printf("/supplementalData/metadata/alias/languageAlias[translate(@type,'_','-') = '%s' and @reason != 'macrolanguage']",
+ tag_string);
+ xobj = xmlXPathEvalExpression((const xmlChar *)q, xctxt);
+ free(q);
+ if (!xobj) {
+ lt_error_set(&err, LT_ERR_FAIL_ON_XML,
+ "No valid elements for %s",
+ doc->name);
+ goto bail;
+ }
+ n = xmlXPathNodeSetGetLength(xobj->nodesetval);
+ if (n != 0)
+ break;
+ xmlXPathFreeObject(xobj);
+ xobj = NULL;
+ }
+ if (n == 0)
+ goto bail;
+ if (n > 1)
+ lt_warning("Multiple subtag data to be canonicalized against alias metadata: %s: %d",
+ tag_string, n);
+ ent = xmlXPathNodeSetItem(xobj->nodesetval, 0);
+ if (!ent) {
+ lt_error_set(&err, LT_ERR_FAIL_ON_XML,
+ "Unable to obtain the xml node via XPath.");
+ goto bail;
+ }
+ rep = xmlGetProp(ent, (const xmlChar *)"replacement");
+ s = lt_string_new((const char *)rep);
+ xmlFree(rep);
+ lt_debug(LT_MSGCAT_TAG, "alias replacement: %s", lt_string_value(s));
+ len = lt_string_length(s);
+ for (i = 0; i < len; i++) {
+ if (lt_string_at(s, i) == '_')
+ lt_string_replace_c(s, i, '-');
+ }
+ retval = lt_tag_new();
+ if (!lt_tag_parse(retval, lt_string_value(s), &err)) {
+ lt_tag_unref(retval);
+ retval = NULL;
+ goto bail;
+ }
+
+ switch (retry) {
+ case 1:
+ case 3:
+ script = (lt_script_t *)lt_tag_get_script(tag);
+ if (script)
+ lt_tag_set_script(retval, lt_script_ref(script));
+ if (retry == 3)
+ goto copies_variants;
+ case 2:
+ region = (lt_region_t *)lt_tag_get_region(tag);
+ if (region)
+ lt_tag_set_region(retval, lt_region_ref(region));
+ default:
+ copies_variants:
+ l = (lt_list_t *)lt_tag_get_variants(tag);
+ for (ll = l; ll; ll = lt_list_next(ll)) {
+ lt_tag_set_variant(retval, lt_variant_ref(lt_list_value(ll)));
+ }
+ break;
+ }
+ extension = (lt_extension_t *)lt_tag_get_extension(tag);
+ if (extension)
+ lt_tag_set_extension(retval, lt_extension_ref(extension));
+ if (tag->privateuse && lt_string_length(tag->privateuse) > 0)
+ lt_string_append(retval->privateuse, lt_string_value(tag->privateuse));
+ if (lt_tag_compare(tag, retval)) {
+ lt_tag_unref(retval);
+ retval = NULL;
+ }
+
+ bail:
+ if (s)
+ lt_string_unref(s);
+ if (xobj)
+ xmlXPathFreeObject(xobj);
+ if (xctxt)
+ xmlXPathFreeContext(xctxt);
+ if (xml)
+ lt_xml_unref(xml);
+ if (lt_error_is_set(err, LT_ERR_ANY)) {
+ if (error)
+ *error = lt_error_ref(err);
+ else
+ lt_error_print(err, LT_ERR_ANY);
+ lt_error_unref(err);
+ }
+
+ return retval;
+}
+
static char *
_lt_tag_canonicalize(lt_tag_t *tag,
lt_bool_t extlang_form,
@@ -1252,10 +1394,13 @@ _lt_tag_canonicalize(lt_tag_t *tag,
lt_list_t *l;
lt_redundant_db_t *rdb = NULL;
lt_redundant_t *r = NULL;
- lt_tag_t *ctag = NULL;
+ lt_tag_t *ctag = NULL, *alias;
lt_return_val_if_fail (tag != NULL, NULL);
+ alias = _lt_tag_canonicalize_alias(tag, &err);
+ if (alias)
+ tag = alias;
string = lt_string_new(NULL);
if (tag->grandfathered) {
lt_string_append(string, lt_grandfathered_get_better_tag(tag->grandfathered));
@@ -1372,6 +1517,8 @@ _lt_tag_canonicalize(lt_tag_t *tag,
"No tag to convert.");
}
bail1:
+ if (alias)
+ lt_tag_unref(alias);
if (ctag)
lt_tag_unref(ctag);
if (rdb)
@@ -1435,6 +1582,7 @@ lt_tag_new(void)
if (retval) {
retval->state = STATE_NONE;
+ retval->tag_string_filter = LT_TAG_FILTER_NONE;
retval->privateuse = lt_string_new(NULL);
lt_mem_add_ref(&retval->parent, retval->privateuse,
(lt_destroy_func_t)lt_string_unref);
@@ -1684,42 +1832,57 @@ lt_tag_truncate(lt_tag_t *tag,
}
/**
- * lt_tag_get_string:
+ * lt_tag_get_string_with_filter:
* @tag: a #lt_tag_t.
+ * @filter: a bitwise OR of #lt_tag_filter_t flags selecting the subtags to include.
*
- * Obtains a language tag in string.
+ * Obtains the language tag as a string, containing only the subtags selected by @filter.
*
* Returns: a language tag string.
*/
const char *
-lt_tag_get_string(lt_tag_t *tag)
+lt_tag_get_string_with_filter(lt_tag_t *tag,
+ int filter)
{
lt_list_t *l;
- if (tag->tag_string)
+ if (tag->tag_string_filter != filter)
+ lt_tag_free_tag_string(tag);
+ else if (tag->tag_string)
return lt_string_value(tag->tag_string);
- if (tag->grandfathered)
- lt_tag_add_tag_string(tag, lt_grandfathered_get_tag(tag->grandfathered));
- else if (tag->language) {
- lt_tag_add_tag_string(tag, lt_lang_get_tag(tag->language));
+ tag->tag_string_filter = filter;
+ if (tag->grandfathered) {
+ if ((filter & LT_TAG_FILTER_GRANDFATHERED))
+ lt_tag_add_tag_string(tag, lt_grandfathered_get_tag(tag->grandfathered));
+ } else if (tag->language) {
+ if ((filter & LT_TAG_FILTER_LANGUAGE))
+ lt_tag_add_tag_string(tag, lt_lang_get_tag(tag->language));
if (tag->extlang)
- lt_tag_add_tag_string(tag, lt_extlang_get_tag(tag->extlang));
+ if ((filter & LT_TAG_FILTER_EXTLANG))
+ lt_tag_add_tag_string(tag, lt_extlang_get_tag(tag->extlang));
if (tag->script)
- lt_tag_add_tag_string(tag, lt_script_get_tag(tag->script));
+ if ((filter & LT_TAG_FILTER_SCRIPT))
+ lt_tag_add_tag_string(tag, lt_script_get_tag(tag->script));
if (tag->region)
- lt_tag_add_tag_string(tag, lt_region_get_tag(tag->region));
- l = tag->variants;
- while (l != NULL) {
- lt_tag_add_tag_string(tag, lt_variant_get_tag(lt_list_value(l)));
- l = lt_list_next(l);
+ if ((filter & LT_TAG_FILTER_REGION))
+ lt_tag_add_tag_string(tag, lt_region_get_tag(tag->region));
+ if ((filter & LT_TAG_FILTER_VARIANT)) {
+ l = tag->variants;
+ while (l != NULL) {
+ lt_tag_add_tag_string(tag, lt_variant_get_tag(lt_list_value(l)));
+ l = lt_list_next(l);
+ }
}
if (tag->extension)
- lt_tag_add_tag_string(tag, lt_extension_get_tag(tag->extension));
+ if ((filter & LT_TAG_FILTER_EXTENSION))
+ lt_tag_add_tag_string(tag, lt_extension_get_tag(tag->extension));
if (tag->privateuse && lt_string_length(tag->privateuse) > 0)
- lt_tag_add_tag_string(tag, lt_string_value(tag->privateuse));
+ if ((filter & LT_TAG_FILTER_PRIVATEUSE))
+ lt_tag_add_tag_string(tag, lt_string_value(tag->privateuse));
} else if (tag->privateuse && lt_string_length(tag->privateuse) > 0) {
- lt_tag_add_tag_string(tag, lt_string_value(tag->privateuse));
+ if ((filter & LT_TAG_FILTER_PRIVATEUSE))
+ lt_tag_add_tag_string(tag, lt_string_value(tag->privateuse));
} else {
return NULL;
}
@@ -1728,6 +1891,20 @@ lt_tag_get_string(lt_tag_t *tag)
}
/**
+ * lt_tag_get_string:
+ * @tag: a #lt_tag_t.
+ *
+ * Obtains the language tag as a string.
+ *
+ * Returns: a language tag string.
+ */
+const char *
+lt_tag_get_string(lt_tag_t *tag)
+{
+ return lt_tag_get_string_with_filter(tag, LT_TAG_FILTER_ALL);
+}
+
+/**
* lt_tag_canonicalize:
* @tag: a #lt_tag_t.
* @error: (allow-none): a #lt_error_t or %NULL.
diff --git a/liblangtag/lt-tag.h b/liblangtag/lt-tag.h
index fbeeab7..0637652 100644
--- a/liblangtag/lt-tag.h
+++ b/liblangtag/lt-tag.h
@@ -37,6 +37,26 @@ LT_BEGIN_DECLS
*/
typedef struct _lt_tag_t lt_tag_t;
+/**
+ * lt_tag_filter_t:
+ *
+ * Bit flags that can be OR'ed together and passed to lt_tag_get_string_with_filter()
+ * to select which subtags are included in the resulting string.
+ */
+typedef enum _lt_tag_filter_t lt_tag_filter_t;
+
+enum _lt_tag_filter_t {
+ LT_TAG_FILTER_NONE = 0,
+ LT_TAG_FILTER_LANGUAGE = 1 << 0,
+ LT_TAG_FILTER_EXTLANG = 1 << 1,
+ LT_TAG_FILTER_SCRIPT = 1 << 2,
+ LT_TAG_FILTER_REGION = 1 << 3,
+ LT_TAG_FILTER_VARIANT = 1 << 4,
+ LT_TAG_FILTER_EXTENSION = 1 << 5,
+ LT_TAG_FILTER_PRIVATEUSE = 1 << 6,
+ LT_TAG_FILTER_GRANDFATHERED = 1 << 7,
+ LT_TAG_FILTER_ALL = -1
+};
lt_tag_t *lt_tag_new (void);
lt_tag_t *lt_tag_ref (lt_tag_t *tag);
@@ -51,6 +71,8 @@ void lt_tag_clear (lt_tag_t *ta
lt_tag_t *lt_tag_copy (const lt_tag_t *tag);
lt_bool_t lt_tag_truncate (lt_tag_t *tag,
lt_error_t **error);
+const char *lt_tag_get_string_with_filter (lt_tag_t *tag,
+ int filter);
const char *lt_tag_get_string (lt_tag_t *tag);
char *lt_tag_canonicalize (lt_tag_t *tag,
lt_error_t **error);
diff --git a/liblangtag/lt-xml.c b/liblangtag/lt-xml.c
index 0ee34a0..3baab53 100644
--- a/liblangtag/lt-xml.c
+++ b/liblangtag/lt-xml.c
@@ -39,6 +39,7 @@ struct _lt_xml_t {
xmlDocPtr cldr_bcp47_variant;
xmlDocPtr cldr_supplemental_likelysubtags;
xmlDocPtr cldr_supplemental_supplementaldata;
+ xmlDocPtr cldr_supplemental_supplementalmetadata;
};
static lt_xml_t *__xml = NULL;
@@ -399,6 +400,10 @@ lt_xml_new(void)
&__xml->cldr_supplemental_supplementaldata,
&err))
goto bail;
+ if (!lt_xml_read_cldr_supplemental(__xml, "supplementalMetadata.xml",
+ &__xml->cldr_supplemental_supplementalmetadata,
+ &err))
+ goto bail;
}
bail:
diff --git a/liblangtag/lt-xml.h b/liblangtag/lt-xml.h
index 917806a..065ac9e 100644
--- a/liblangtag/lt-xml.h
+++ b/liblangtag/lt-xml.h
@@ -33,8 +33,9 @@ typedef enum _lt_xml_cldr_t {
LT_XML_CLDR_DUMMY1 = 100,
LT_XML_CLDR_SUPPLEMENTAL_LIKELY_SUBTAGS,
LT_XML_CLDR_SUPPLEMENTAL_SUPPLEMENTAL_DATA,
+ LT_XML_CLDR_SUPPLEMENTAL_SUPPLEMENTAL_METADATA,
LT_XML_CLDR_SUPPLEMENTAL_BEGIN = LT_XML_CLDR_SUPPLEMENTAL_LIKELY_SUBTAGS,
- LT_XML_CLDR_SUPPLEMENTAL_END = LT_XML_CLDR_SUPPLEMENTAL_SUPPLEMENTAL_DATA,
+ LT_XML_CLDR_SUPPLEMENTAL_END = LT_XML_CLDR_SUPPLEMENTAL_SUPPLEMENTAL_METADATA,
LT_XML_CLDR_END
} lt_xml_cldr_t;
diff --git a/tests/check-tag.c b/tests/check-tag.c
index 976303e..df9d581 100644
--- a/tests/check-tag.c
+++ b/tests/check-tag.c
@@ -184,6 +184,21 @@ TDEF (lt_tag_canonicalize) {
fail_unless(s != NULL, "Unable to be canonicalize.");
fail_unless(lt_strcmp0(s, "bzs") == 0, "Unexpected result to be canonicalized.");
free(s);
+ fail_unless(lt_tag_parse(t1, "sh-Arab-AQ", NULL), "should be valid langtag.");
+ s = lt_tag_canonicalize(t1, NULL);
+ fail_unless(s != NULL, "Unable to canonicalize.");
+ fail_unless(lt_strcmp0(s, "sr-Arab-AQ") == 0, "Unexpected result to be canonicalized.");
+ free(s);
+ fail_unless(lt_tag_parse(t1, "sh", NULL), "should be valid langtag.");
+ s = lt_tag_canonicalize(t1, NULL);
+ fail_unless(s != NULL, "Unable to be canonicalize.");
+ fail_unless(lt_strcmp0(s, "sr-Latn") == 0, "Unexpected result to be canonicalized.");
+ free(s);
+ fail_unless(lt_tag_parse(t1, "mo", NULL), "should be valid langtag.");
+ s = lt_tag_canonicalize(t1, NULL);
+ fail_unless(s != NULL, "Unable to be canonicalize.");
+ fail_unless(lt_strcmp0(s, "ro-MD") == 0, "Unexpected result to be canonicalized.");
+ free(s);
lt_tag_unref(t1);
} TEND