diff options
author | Akira TAGOH <akira@tagoh.org> | 2015-12-03 18:47:49 +0900 |
---|---|---|
committer | Akira TAGOH <akira@tagoh.org> | 2015-12-03 18:47:49 +0900 |
commit | dcf359902feb34e0f8f5f3fe5e3dae719af5937e (patch) | |
tree | 19ef6a8077f8771b9c076d2ba98f563417d0e864 | |
parent | 0207866d389f8da5739823e311fac16c9f52c33c (diff) |
Support the language alias for the canonicalization
-rw-r--r-- | data/Makefile.am | 1 | ||||
-rw-r--r-- | liblangtag/lt-tag.c | 215 | ||||
-rw-r--r-- | liblangtag/lt-tag.h | 22 | ||||
-rw-r--r-- | liblangtag/lt-xml.c | 5 | ||||
-rw-r--r-- | liblangtag/lt-xml.h | 3 | ||||
-rw-r--r-- | tests/check-tag.c | 15 |
6 files changed, 241 insertions, 20 deletions
diff --git a/data/Makefile.am b/data/Makefile.am index 9ec3d34..11e20f3 100644 --- a/data/Makefile.am +++ b/data/Makefile.am @@ -45,6 +45,7 @@ bcp47_xml_files = \ supplemental_xml_files = \ common/supplemental/likelySubtags.xml \ common/supplemental/supplementalData.xml \ + common/supplemental/supplementalMetadata.xml \ $(NULL) stamp_files = \ stamp-core-zip \ diff --git a/liblangtag/lt-tag.c b/liblangtag/lt-tag.c index 56b8a46..729f50d 100644 --- a/liblangtag/lt-tag.c +++ b/liblangtag/lt-tag.c @@ -53,6 +53,7 @@ struct _lt_tag_t { lt_mem_t parent; int32_t wildcard_map; lt_tag_state_t state; + int tag_string_filter; lt_string_t *tag_string; lt_lang_t *language; lt_extlang_t *extlang; @@ -1241,6 +1242,147 @@ _lt_tag_convert_from_locale_string(const char *locale, return tag; } +static lt_tag_t * +_lt_tag_canonicalize_alias(lt_tag_t *tag, + lt_error_t **error) +{ + lt_xml_t *xml; + lt_string_t *s = NULL; + lt_tag_t *retval = NULL; + lt_script_t *script; + lt_region_t *region; + lt_list_t *l, *ll; + lt_extension_t *extension; + lt_error_t *err = NULL; + xmlDocPtr doc; + xmlXPathContextPtr xctxt = NULL; + xmlXPathObjectPtr xobj = NULL; + xmlNodePtr ent; + char *q; + const char *tag_string; + int n, i, retry, filter; + xmlChar *rep; + size_t len; + + xml = lt_xml_new(); + doc = lt_xml_get_cldr(xml, LT_XML_CLDR_SUPPLEMENTAL_SUPPLEMENTAL_METADATA); + xctxt = xmlXPathNewContext(doc); + if (!xctxt) { + lt_error_set(&err, LT_ERR_OOM, + "Unable to create an instance of xmlXPathContextPtr."); + goto bail; + } + for (retry = 4, n = 0; retry > 0; retry--) { + switch (retry) { + case 1: + filter = LT_TAG_FILTER_LANGUAGE|LT_TAG_FILTER_GRANDFATHERED; + break; + case 2: + filter = LT_TAG_FILTER_LANGUAGE|LT_TAG_FILTER_SCRIPT|LT_TAG_FILTER_GRANDFATHERED; + break; + case 3: + filter = LT_TAG_FILTER_LANGUAGE|LT_TAG_FILTER_REGION|LT_TAG_FILTER_GRANDFATHERED; + break; + default: + filter = LT_TAG_FILTER_LANGUAGE|LT_TAG_FILTER_SCRIPT|LT_TAG_FILTER_REGION|LT_TAG_FILTER_GRANDFATHERED; + break; + } + tag_string = lt_tag_get_string_with_filter(tag, filter); + lt_debug(LT_MSGCAT_TAG, "alias lookup: %s", tag_string); + /* explicitly exclude 'macrolanguage' alias so that we deal with it in the canonicalization for extlang form */ + q = lt_strdup_printf("/supplementalData/metadata/alias/languageAlias[translate(@type,'_','-') = '%s' and @reason != 'macrolanguage']", + tag_string); + xobj = xmlXPathEvalExpression((const xmlChar *)q, xctxt); + free(q); + if (!xobj) { + lt_error_set(&err, LT_ERR_FAIL_ON_XML, + "No valid elements for %s", + doc->name); + goto bail; + } + n = xmlXPathNodeSetGetLength(xobj->nodesetval); + if (n != 0) + break; + xmlXPathFreeObject(xobj); + xobj = NULL; + } + if (n == 0) + goto bail; + if (n > 1) + lt_warning("Multiple subtag data to be canonicalized against alias metadata: %s: %d", + tag_string, n); + ent = xmlXPathNodeSetItem(xobj->nodesetval, 0); + if (!ent) { + lt_error_set(&err, LT_ERR_FAIL_ON_XML, + "Unable to obtain the xml node via XPath."); + goto bail; + } + rep = xmlGetProp(ent, (const xmlChar *)"replacement"); + s = lt_string_new((const char *)rep); + xmlFree(rep); + lt_debug(LT_MSGCAT_TAG, "alias replacement: %s", lt_string_value(s)); + len = lt_string_length(s); + for (i = 0; i < len; i++) { + if (lt_string_at(s, i) == '_') + lt_string_replace_c(s, i, '-'); + } + retval = lt_tag_new(); + if (!lt_tag_parse(retval, lt_string_value(s), &err)) { + lt_tag_unref(retval); + retval = NULL; + goto bail; + } + + switch (retry) { + case 1: + case 3: + script = (lt_script_t *)lt_tag_get_script(tag); + if (script) + lt_tag_set_script(retval, lt_script_ref(script)); + if (retry == 3) + goto copies_variants; + case 2: + region = (lt_region_t *)lt_tag_get_region(tag); + if (region) + lt_tag_set_region(retval, lt_region_ref(region)); + default: + copies_variants: + l = (lt_list_t *)lt_tag_get_variants(tag); + for (ll = l; ll; ll = lt_list_next(ll)) { + lt_tag_set_variant(retval, lt_variant_ref(lt_list_value(ll))); + } + break; + } + extension = (lt_extension_t *)lt_tag_get_extension(tag); + if (extension) + lt_tag_set_extension(retval, lt_extension_ref(extension)); + if (tag->privateuse && lt_string_length(tag->privateuse) > 0) + lt_string_append(retval->privateuse, lt_string_value(tag->privateuse)); + if (lt_tag_compare(tag, retval)) { + lt_tag_unref(retval); + retval = NULL; + } + + bail: + if (s) + lt_string_unref(s); + if (xobj) + xmlXPathFreeObject(xobj); + if (xctxt) + xmlXPathFreeContext(xctxt); + if (xml) + lt_xml_unref(xml); + if (lt_error_is_set(err, LT_ERR_ANY)) { + if (error) + *error = lt_error_ref(err); + else + lt_error_print(err, LT_ERR_ANY); + lt_error_unref(err); + } + + return retval; +} + static char * _lt_tag_canonicalize(lt_tag_t *tag, lt_bool_t extlang_form, @@ -1252,10 +1394,13 @@ _lt_tag_canonicalize(lt_tag_t *tag, lt_list_t *l; lt_redundant_db_t *rdb = NULL; lt_redundant_t *r = NULL; - lt_tag_t *ctag = NULL; + lt_tag_t *ctag = NULL, *alias; lt_return_val_if_fail (tag != NULL, NULL); + alias = _lt_tag_canonicalize_alias(tag, &err); + if (alias) + tag = alias; string = lt_string_new(NULL); if (tag->grandfathered) { lt_string_append(string, lt_grandfathered_get_better_tag(tag->grandfathered)); @@ -1372,6 +1517,8 @@ _lt_tag_canonicalize(lt_tag_t *tag, "No tag to convert."); } bail1: + if (alias) + lt_tag_unref(alias); if (ctag) lt_tag_unref(ctag); if (rdb) @@ -1435,6 +1582,7 @@ lt_tag_new(void) if (retval) { retval->state = STATE_NONE; + retval->tag_string_filter = LT_TAG_FILTER_NONE; retval->privateuse = lt_string_new(NULL); lt_mem_add_ref(&retval->parent, retval->privateuse, (lt_destroy_func_t)lt_string_unref); @@ -1684,42 +1832,57 @@ lt_tag_truncate(lt_tag_t *tag, } /** - * lt_tag_get_string: + * lt_tag_get_string_with_filter: * @tag: a #lt_tag_t. + * @filter: a binary count sequence of #lt_tag_filter_t. * - * Obtains a language tag in string. + * Obtains a language tag in string against @filter. * * Returns: a language tag string. */ const char * -lt_tag_get_string(lt_tag_t *tag) +lt_tag_get_string_with_filter(lt_tag_t *tag, + int filter) { lt_list_t *l; - if (tag->tag_string) + if (tag->tag_string_filter != filter) + lt_tag_free_tag_string(tag); + else if (tag->tag_string) return lt_string_value(tag->tag_string); - if (tag->grandfathered) - lt_tag_add_tag_string(tag, lt_grandfathered_get_tag(tag->grandfathered)); - else if (tag->language) { - lt_tag_add_tag_string(tag, lt_lang_get_tag(tag->language)); + tag->tag_string_filter = filter; + if (tag->grandfathered) { + if ((filter & LT_TAG_FILTER_GRANDFATHERED)) + lt_tag_add_tag_string(tag, lt_grandfathered_get_tag(tag->grandfathered)); + } else if (tag->language) { + if ((filter & LT_TAG_FILTER_LANGUAGE)) + lt_tag_add_tag_string(tag, lt_lang_get_tag(tag->language)); if (tag->extlang) - lt_tag_add_tag_string(tag, lt_extlang_get_tag(tag->extlang)); + if ((filter & LT_TAG_FILTER_EXTLANG)) + lt_tag_add_tag_string(tag, lt_extlang_get_tag(tag->extlang)); if (tag->script) - lt_tag_add_tag_string(tag, lt_script_get_tag(tag->script)); + if ((filter & LT_TAG_FILTER_SCRIPT)) + lt_tag_add_tag_string(tag, lt_script_get_tag(tag->script)); if (tag->region) - lt_tag_add_tag_string(tag, lt_region_get_tag(tag->region)); - l = tag->variants; - while (l != NULL) { - lt_tag_add_tag_string(tag, lt_variant_get_tag(lt_list_value(l))); - l = lt_list_next(l); + if ((filter & LT_TAG_FILTER_REGION)) + lt_tag_add_tag_string(tag, lt_region_get_tag(tag->region)); + if ((filter & LT_TAG_FILTER_VARIANT)) { + l = tag->variants; + while (l != NULL) { + lt_tag_add_tag_string(tag, lt_variant_get_tag(lt_list_value(l))); + l = lt_list_next(l); + } } if (tag->extension) - lt_tag_add_tag_string(tag, lt_extension_get_tag(tag->extension)); + if ((filter & LT_TAG_FILTER_EXTENSION)) + lt_tag_add_tag_string(tag, lt_extension_get_tag(tag->extension)); if (tag->privateuse && lt_string_length(tag->privateuse) > 0) - lt_tag_add_tag_string(tag, lt_string_value(tag->privateuse)); + if ((filter & LT_TAG_FILTER_PRIVATEUSE)) + lt_tag_add_tag_string(tag, lt_string_value(tag->privateuse)); } else if (tag->privateuse && lt_string_length(tag->privateuse) > 0) { - lt_tag_add_tag_string(tag, lt_string_value(tag->privateuse)); + if ((filter & LT_TAG_FILTER_PRIVATEUSE)) + lt_tag_add_tag_string(tag, lt_string_value(tag->privateuse)); } else { return NULL; } @@ -1728,6 +1891,20 @@ lt_tag_get_string(lt_tag_t *tag) } /** + * lt_tag_get_string: + * @tag: a #lt_tag_t. + * + * Obtains a language tag in string. + * + * Returns: a language tag string. + */ +const char * +lt_tag_get_string(lt_tag_t *tag) +{ + return lt_tag_get_string_with_filter(tag, LT_TAG_FILTER_ALL); +} + +/** * lt_tag_canonicalize: * @tag: a #lt_tag_t. * @error: (allow-none): a #lt_error_t or %NULL. diff --git a/liblangtag/lt-tag.h b/liblangtag/lt-tag.h index fbeeab7..0637652 100644 --- a/liblangtag/lt-tag.h +++ b/liblangtag/lt-tag.h @@ -37,6 +37,26 @@ LT_BEGIN_DECLS */ typedef struct _lt_tag_t lt_tag_t; +/** + * lt_tag_filter_t: + * + * This is used as a binary count sequence with lt_tag_get_string_with_filter() + * to allow only given tags to be the outcome. + */ +typedef enum _lt_tag_filter_t lt_tag_filter_t; + +enum _lt_tag_filter_t { + LT_TAG_FILTER_NONE = 0, + LT_TAG_FILTER_LANGUAGE = 1 << 0, + LT_TAG_FILTER_EXTLANG = 1 << 1, + LT_TAG_FILTER_SCRIPT = 1 << 2, + LT_TAG_FILTER_REGION = 1 << 3, + LT_TAG_FILTER_VARIANT = 1 << 4, + LT_TAG_FILTER_EXTENSION = 1 << 5, + LT_TAG_FILTER_PRIVATEUSE = 1 << 6, + LT_TAG_FILTER_GRANDFATHERED = 1 << 7, + LT_TAG_FILTER_ALL = -1 +}; lt_tag_t *lt_tag_new (void); lt_tag_t *lt_tag_ref (lt_tag_t *tag); @@ -51,6 +71,8 @@ void lt_tag_clear (lt_tag_t *ta lt_tag_t *lt_tag_copy (const lt_tag_t *tag); lt_bool_t lt_tag_truncate (lt_tag_t *tag, lt_error_t **error); +const char *lt_tag_get_string_with_filter (lt_tag_t *tag, + int filter); const char *lt_tag_get_string (lt_tag_t *tag); char *lt_tag_canonicalize (lt_tag_t *tag, lt_error_t **error); diff --git a/liblangtag/lt-xml.c b/liblangtag/lt-xml.c index 0ee34a0..3baab53 100644 --- a/liblangtag/lt-xml.c +++ b/liblangtag/lt-xml.c @@ -39,6 +39,7 @@ struct _lt_xml_t { xmlDocPtr cldr_bcp47_variant; xmlDocPtr cldr_supplemental_likelysubtags; xmlDocPtr cldr_supplemental_supplementaldata; + xmlDocPtr cldr_supplemental_supplementalmetadata; }; static lt_xml_t *__xml = NULL; @@ -399,6 +400,10 @@ lt_xml_new(void) &__xml->cldr_supplemental_supplementaldata, &err)) goto bail; + if (!lt_xml_read_cldr_supplemental(__xml, "supplementalMetadata.xml", + &__xml->cldr_supplemental_supplementalmetadata, + &err)) + goto bail; } bail: diff --git a/liblangtag/lt-xml.h b/liblangtag/lt-xml.h index 917806a..065ac9e 100644 --- a/liblangtag/lt-xml.h +++ b/liblangtag/lt-xml.h @@ -33,8 +33,9 @@ typedef enum _lt_xml_cldr_t { LT_XML_CLDR_DUMMY1 = 100, LT_XML_CLDR_SUPPLEMENTAL_LIKELY_SUBTAGS, LT_XML_CLDR_SUPPLEMENTAL_SUPPLEMENTAL_DATA, + LT_XML_CLDR_SUPPLEMENTAL_SUPPLEMENTAL_METADATA, LT_XML_CLDR_SUPPLEMENTAL_BEGIN = LT_XML_CLDR_SUPPLEMENTAL_LIKELY_SUBTAGS, - LT_XML_CLDR_SUPPLEMENTAL_END = LT_XML_CLDR_SUPPLEMENTAL_SUPPLEMENTAL_DATA, + LT_XML_CLDR_SUPPLEMENTAL_END = LT_XML_CLDR_SUPPLEMENTAL_SUPPLEMENTAL_METADATA, LT_XML_CLDR_END } lt_xml_cldr_t; diff --git a/tests/check-tag.c b/tests/check-tag.c index 976303e..df9d581 100644 --- a/tests/check-tag.c +++ b/tests/check-tag.c @@ -184,6 +184,21 @@ TDEF (lt_tag_canonicalize) { fail_unless(s != NULL, "Unable to be canonicalize."); fail_unless(lt_strcmp0(s, "bzs") == 0, "Unexpected result to be canonicalized."); free(s); + fail_unless(lt_tag_parse(t1, "sh-Arab-AQ", NULL), "should be valid langtag."); + s = lt_tag_canonicalize(t1, NULL); + fail_unless(s != NULL, "Unable to canonicalize."); + fail_unless(lt_strcmp0(s, "sr-Arab-AQ") == 0, "Unexpected result to be canonicalized."); + free(s); + fail_unless(lt_tag_parse(t1, "sh", NULL), "should be valid langtag."); + s = lt_tag_canonicalize(t1, NULL); + fail_unless(s != NULL, "Unable to be canonicalize."); + fail_unless(lt_strcmp0(s, "sr-Latn") == 0, "Unexpected result to be canonicalized."); + free(s); + fail_unless(lt_tag_parse(t1, "mo", NULL), "should be valid langtag."); + s = lt_tag_canonicalize(t1, NULL); + fail_unless(s != NULL, "Unable to be canonicalize."); + fail_unless(lt_strcmp0(s, "ro-MD") == 0, "Unexpected result to be canonicalized."); + free(s); lt_tag_unref(t1); } TEND |