# From: http://www.w3.org/TR/REC-xml#NT-CombiningChar # # * Name start characters must have one of the categories Ll, Lu, Lo, # Lt, Nl. # # * Name characters other than Name-start characters must have one of # the categories Mc, Me, Mn, Lm, or Nd. # # * Characters in the compatibility area (i.e. with character code # greater than #xF900 and less than #xFFFE) are not allowed in XML # names. # # * Characters which have a font or compatibility decomposition # (i.e. those with a "compatibility formatting tag" in field 5 of the # database -- marked by field 5 beginning with a "<") are not allowed. # # * The following characters are treated as name-start characters rather # than name characters, because the property file classifies them as # Alphabetic: [#x02BB-#x02C1], #x0559, #x06E5, #x06E6. # # * Characters #x20DD-#x20E0 are excluded (in accordance with Unicode # 2.0, section 5.14). # # * Character #x00B7 is classified as an extender, because the property # list so identifies it. # # * Character #x0387 is added as a name character, because #x00B7 is its # canonical equivalent. # # * Characters ':' and '_' are allowed as name-start characters. # # * Characters '-' and '.' are allowed as name characters. from unicodedata import category, decomposition NAME_START_CATEGORIES = ["Ll", "Lu", "Lo", "Lt", "Nl"] NAME_CATEGORIES = NAME_START_CATEGORIES + ["Mc", "Me", "Mn", "Lm", "Nd"] ALLOWED_NAME_CHARS = [u"\u00B7", u"\u0387", u"-", u".", u"_"] # http://www.w3.org/TR/REC-xml-names/#NT-NCName # [4] NCName ::= (Letter | '_') (NCNameChar)* /* An XML Name, minus # the ":" */ # [5] NCNameChar ::= Letter | Digit | '.' | '-' | '_' | CombiningChar # | Extender def is_ncname(name): first = name[0] if first=="_" or category(first) in NAME_START_CATEGORIES: for i in xrange(1, len(name)): c = name[i] if not category(c) in NAME_CATEGORIES: if c in ALLOWED_NAME_CHARS: continue return 0 #if in compatibility area #if decomposition(c)!='': # return 0 return 1 else: return 0 def split_uri(predicate): predicate = predicate length = len(predicate) for i in xrange(0, length): if not category(predicate[-i-1]) in NAME_CATEGORIES: for j in xrange(-1-i, length): if category(predicate[j]) in NAME_START_CATEGORIES: ns = predicate[:j] if not ns: break ln = predicate[j:] return (ns, ln) break raise Error("This graph cannot be serialized in RDF/XML. Could not split predicate: '%s'" % predicate)