#!/usr/bin/python
"""$Id$
Test XML character decoding against a range of encodings, valid and not."""

__author__ = "Joseph Walton "
__version__ = "$Revision$"
__copyright__ = "Copyright (c) 2004, 2006 Joseph Walton"

import os
import sys
import codecs
import re

# Make the parent of this directory importable so that the 'feedvalidator'
# package imported below can be found when this script is run in place.
curdir = os.path.abspath(os.path.dirname(__file__))
srcdir = os.path.split(curdir)[0]
if srcdir not in sys.path:
    sys.path.insert(0, srcdir)
basedir = os.path.split(srcdir)[0]

# Names of generated cases that could not be turned into tests; filled in
# while the suite is built and reported at the end of a script run.
skippedNames = []

# 'new' and 'glob' are not referenced by the code visible in this file; they
# are kept so that nothing relying on this module's namespace breaks.
import unittest
import new
import glob

# Must come after the sys.path adjustment above.
from feedvalidator import xmlEncoding


class EncodingTestCase(unittest.TestCase):
    """Check xmlEncoding.detect() against one generated document.

    Instances are configured by plain attribute assignment after
    construction:

    bytes             the encoded document handed to detect()
    filename          a label used only in failure messages
    expectedEncoding  the name detect() must return (read only by
                      testEncodingMatches)
    """

    def testEncodingMatches(self):
        """detect() must return the expected encoding without raising."""
        try:
            enc = xmlEncoding.detect(self.bytes)
        except UnicodeError as u:
            self.fail("'" + self.filename
                      + "' should not cause an exception (" + str(u) + ")")

        # Check for a result first, so that the comparison message below can
        # safely concatenate 'enc'.
        self.assertTrue(
            enc,
            'An encoding must be returned for all valid files ('
            + self.filename + ')')
        self.assertEqual(
            enc, self.expectedEncoding,
            'Encoding for ' + self.filename + ' should be '
            + self.expectedEncoding + ', but was ' + enc)

    def testEncodingFails(self):
        """detect() must return nothing, log a reason, and not raise."""
        eventLog = []

        try:
            encoding = xmlEncoding.detect(self.bytes, eventLog)
        except UnicodeError as u:
            self.fail("'" + self.filename
                      + "' should not cause an exception (" + str(u) + ")")

        if encoding:
            self.fail("'" + self.filename
                      + "' should not parse successfully (as "
                      + encoding + ")")

        # A rejected document must leave at least one entry in the event log.
        if not eventLog:
            self.fail("'" + self.filename
                      + "' should give a reason for parse failure")


# Byte-order marks, written as Python 2 byte strings.
bom8 = '\xEF\xBB\xBF'
bom16BE = '\xFE\xFF'
bom16LE = '\xFF\xFE'
bom32BE = '\x00\x00\xFE\xFF'
bom32LE = '\xFF\xFE\x00\x00'

# Some fairly typical Unicode text. It should survive XML roundtripping.
# NOTE(review): this sample has no enclosing element, so by itself it is not
# a well-formed XML document; markup may have been lost from this literal.
# Confirm against the project history before relying on it for parsing tests.
docText = u'\u201c"This\uFEFF" is\na\r\u00A3t\u20Acst\u201D'

# An XML encoding name: a letter followed by letters, digits, '.', '_' or '-'.
# The trailing \Z makes match() reject names with trailing junk.
validDecl = re.compile(r'[A-Za-z][-A-Za-z0-9._]*\Z')


def makeDecl(enc=None):
    """Return an XML declaration, optionally naming an encoding.

    enc -- the encoding name to declare; when omitted (or empty) the
           declaration carries only the version.

    Raises AssertionError if 'enc' is not a syntactically valid name.
    """
    if enc:
        assert validDecl.match(enc), \
            "'" + enc + "' is not a valid encoding name"
        return "<?xml version='1.0' encoding='" + enc + "'?>"
    else:
        return "<?xml version='1.0'?>"


def encoded(enc, txt=docText):
    """Encode 'txt' with codec 'enc' and return the resulting bytes.

    Characters the codec cannot represent become numeric character
    references.  Raises LookupError if the codec is not installed.
    """
    return codecs.getencoder(enc)(txt, 'xmlcharrefreplace')[0]


def genValidXmlTestCases():
    """Yield (encoding, tags, bytes) for documents detection must accept.

    'encoding' is the name detection is expected to report, 'tags' is a list
    of labels later used to build the case name, and 'bytes' is the encoded
    document.  Cases whose codec is not installed are reported on stdout and
    left out; each codec family is guarded separately so that one missing
    codec does not discard cases for another.
    """
    someFailed = False

    # Required
    yield ('UTF-8', ['BOM', 'declaration'],
           bom8 + makeDecl('UTF-8') + encoded('UTF-8'))
    yield ('UTF-8', [],
           encoded('UTF-8'))
    yield ('UTF-8', ['noenc'],
           makeDecl() + encoded('UTF-8'))
    yield ('UTF-8', ['declaration'],
           makeDecl('UTF-8') + encoded('UTF-8'))
    yield ('UTF-8', ['BOM'],
           bom8 + encoded('UTF-8'))
    # 'noenc' means a declaration that names no encoding.
    yield ('UTF-8', ['BOM', 'noenc'],
           bom8 + makeDecl() + encoded('UTF-8'))

    yield ('UTF-16', ['BOM', 'declaration', 'BE'],
           bom16BE + encoded('UTF-16BE', makeDecl('UTF-16') + docText))
    yield ('UTF-16', ['BOM', 'declaration', 'LE'],
           bom16LE + encoded('UTF-16LE', makeDecl('UTF-16') + docText))
    yield ('UTF-16', ['BOM', 'BE'],
           bom16BE + encoded('UTF-16BE'))
    yield ('UTF-16', ['BOM', 'BE', 'noenc'],
           bom16BE + encoded('UTF-16BE', makeDecl() + docText))
    yield ('UTF-16', ['BOM', 'LE'],
           bom16LE + encoded('UTF-16LE'))
    yield ('UTF-16', ['BOM', 'LE', 'noenc'],
           bom16LE + encoded('UTF-16LE', makeDecl() + docText))
    yield ('UTF-16', ['declaration', 'BE'],
           encoded('UTF-16BE', makeDecl('UTF-16') + docText))
    yield ('UTF-16', ['declaration', 'LE'],
           encoded('UTF-16LE', makeDecl('UTF-16') + docText))

    # Standard wide encodings
    try:
        yield ('ISO-10646-UCS-2', ['BOM', 'declaration', 'BE'],
               bom16BE + encoded('UCS-2BE',
                                 makeDecl('ISO-10646-UCS-2') + docText))
        yield ('ISO-10646-UCS-2', ['BOM', 'declaration', 'LE'],
               bom16LE + encoded('UCS-2LE',
                                 makeDecl('ISO-10646-UCS-2') + docText))
    except LookupError as e:
        print(e)
        someFailed = True

    try:
        yield ('UTF-32', ['BOM', 'declaration', 'BE'],
               bom32BE + encoded('UTF-32BE', makeDecl('UTF-32') + docText))
        yield ('UTF-32', ['BOM', 'declaration', 'LE'],
               bom32LE + encoded('UTF-32LE', makeDecl('UTF-32') + docText))
        yield ('UTF-32', ['declaration', 'BE'],
               encoded('UTF-32BE', makeDecl('UTF-32') + docText))
        yield ('UTF-32', ['declaration', 'LE'],
               encoded('UTF-32LE', makeDecl('UTF-32') + docText))
    except LookupError as e:
        print(e)
        someFailed = True

    try:
        yield ('ISO-10646-UCS-4', ['BOM', 'declaration', 'BE'],
               bom32BE + encoded('UCS-4BE',
                                 makeDecl('ISO-10646-UCS-4') + docText))
        yield ('ISO-10646-UCS-4', ['BOM', 'declaration', 'LE'],
               bom32LE + encoded('UCS-4LE',
                                 makeDecl('ISO-10646-UCS-4') + docText))
    except LookupError as e:
        print(e)
        someFailed = True

    # Encodings that don't have BOMs, and require declarations
    withDeclarations = [
        # Common ASCII-compatible encodings
        'US-ASCII', 'ISO-8859-1', 'ISO-8859-15', 'WINDOWS-1252',

        # EBCDIC
        'IBM037', 'IBM038',

        # Encodings with explicit endianness
        'UTF-16BE', 'UTF-16LE',
        'UTF-32BE', 'UTF-32LE',
        # (UCS doesn't seem to define endian'd encodings)
    ]

    for enc in withDeclarations:
        try:
            yield (enc, ['declaration'],
                   encoded(enc, makeDecl(enc) + docText))
        except LookupError as e:
            print(e)
            someFailed = True

    # 10646-UCS encodings, with no BOM but with a declaration
    try:
        yield ('ISO-10646-UCS-2', ['declaration', 'BE'],
               encoded('UCS-2BE', makeDecl('ISO-10646-UCS-2') + docText))
        yield ('ISO-10646-UCS-2', ['declaration', 'LE'],
               encoded('UCS-2LE', makeDecl('ISO-10646-UCS-2') + docText))
    except LookupError as e:
        print(e)
        someFailed = True

    try:
        yield ('ISO-10646-UCS-4', ['declaration', 'BE'],
               encoded('UCS-4BE', makeDecl('ISO-10646-UCS-4') + docText))
        # No BOM here: this section covers BOM-less documents, and the
        # BOM + declaration + LE combination is already generated above.
        yield ('ISO-10646-UCS-4', ['declaration', 'LE'],
               encoded('UCS-4LE', makeDecl('ISO-10646-UCS-4') + docText))
    except LookupError as e:
        print(e)
        someFailed = True

    # Files with aliases for declarations. The declared alias should be
    # reported back, rather than the canonical form.
    try:
        yield ('csUnicode', ['alias', 'BOM', 'BE'],
               bom16BE + encoded('UCS-2BE', makeDecl('csUnicode') + docText))
        yield ('csUnicode', ['alias', 'LE'],
               encoded('UCS-2LE', makeDecl('csUnicode') + docText))
    except LookupError as e:
        print(e)
        someFailed = True

    try:
        yield ('csucs4', ['alias', 'BE'],
               encoded('csucs4', makeDecl('csucs4') + docText))
    except LookupError as e:
        print(e)
        someFailed = True

    if someFailed:
        print("Unable to generate some tests; see README for details")


def genInvalidXmlTestCases():
    """Yield (encoding, tags, bytes) for documents detection must reject.

    The tuple layout matches genValidXmlTestCases(); here 'encoding' only
    names the case and selects the codec checked before a test is built.
    """
    # Invalid files
    someFailed = False

    # NOTE(review): the plain 'UTF-32' and 'UTF-16' codecs used in the next
    # two cases write their own BOM in the platform's byte order, so the
    # 'BE' tag may not describe the bytes on a little-endian host -- confirm.

    # UTF-32 with a non-four-byte declaration
    try:
        yield ('UTF-32', ['BOM', 'BE', 'declaration'],
               encoded('UTF-32', makeDecl('US-ASCII') + docText))
    except LookupError as e:
        print(e)
        someFailed = True

    # UTF-16 with a non-two-byte declaration
    yield ('UTF-16', ['BOM', 'BE', 'declaration'],
           encoded('UTF-16', makeDecl('UTF-8') + docText))

    # UTF-16BE, with a BOM
    yield ('UTF-16BE', ['BOM', 'declaration'],
           bom16BE + encoded('UTF-16BE', makeDecl('UTF-16BE') + docText))

    # UTF-8, with a BOM, declaring US-ASCII
    yield ('UTF-8', ['BOM', 'declaration'],
           bom8 + encoded('UTF-8', makeDecl('US-ASCII') + docText))

    try:
        # UTF-32, with a BOM, beginning without a declaration
        yield ('UTF-32', ['BOM', 'BE'],
               bom32BE + encoded('UTF-32BE'))

        # UTF-32, with a BOM, and a declaration with no encoding
        yield ('UTF-32', ['BOM', 'BE', 'noenc'],
               bom32BE + encoded('UTF-32BE', makeDecl() + docText))
    except LookupError as e:
        print(e)
        someFailed = True

    # UTF-16, no BOM, no declaration
    # yield('UTF-16', ['BE'], encoded('UTF-16BE'))
    # This case falls through, and is identified as UTF-8; leave it out
    # until we're doing decoding as well as detection.

    if someFailed:
        print("Unable to generate some tests; see README for details")


def genXmlTestCases():
    """Yield (encoding, tags, bytes, valid) for every generated document.

    Valid cases come first, flagged True; invalid cases follow, flagged
    False.
    """
    for (enc, t, x) in genValidXmlTestCases():
        yield (enc, t, x, True)
    for (enc, t, x) in genInvalidXmlTestCases():
        yield (enc, t, x, False)


def buildTestSuite():
    """Return a unittest.TestSuite with one test per generated document.

    A case is skipped, and its name appended to the module-level
    'skippedNames' list, when Python has no codec for its encoding.
    """
    suite = unittest.TestSuite()

    for (enc, tags, x, valid) in genXmlTestCases():
        if valid:
            pfx = 'valid_'
        else:
            pfx = 'invalid_'

        # The case name doubles as the label shown in failure messages.
        name = pfx + '_'.join([enc] + sorted(tags)) + '.xml'

        # name, x is content
        try:
            # Codecs are looked up without the 'ISO-10646-' prefix.
            alias = enc
            if enc.startswith('ISO-10646-'):
                alias = enc[10:]
            # Called only for its LookupError when the codec is missing.
            codecs.lookup(alias)

            if valid:
                case = EncodingTestCase('testEncodingMatches')
                case.expectedEncoding = enc
            else:
                case = EncodingTestCase('testEncodingFails')

            case.filename = name
            case.bytes = x
            suite.addTest(case)
        except LookupError as e:
            print("Skipping " + name + ": " + str(e))
            skippedNames.append(name)

    return suite


if __name__ == "__main__":
    s = buildTestSuite()
    unittest.TextTestRunner().run(s)
    if skippedNames:
        print("Tests skipped: %d" % len(skippedNames))
        print("Please see README for details")