diff --git a/codevalidator.py b/codevalidator.py index 620a956..8624524 100755 --- a/codevalidator.py +++ b/codevalidator.py @@ -32,6 +32,7 @@ import sys import tempfile import shutil +import codecs if sys.version_info.major == 2: # Pythontidy is only supported on Python2 @@ -47,6 +48,10 @@ DEFAULT_CONFIG_PATHS = ['~/.codevalidatorrc', '/etc/codevalidatorrc'] +# The first rule name which matches a registered encoding is used +# both as a check that the file can be read with that encoding, +# as well as a encoding filter for those rules which support fixing. + DEFAULT_RULES = [ 'utf8', 'nobom', @@ -78,7 +83,7 @@ '*.php': DEFAULT_RULES + ['phpcs'], '*.phtml': DEFAULT_RULES, '*.pp': DEFAULT_RULES + ['puppet'], - '*.properties': DEFAULT_RULES + ['ascii'], + '*.properties': ['ascii'] + DEFAULT_RULES, '*.py': DEFAULT_RULES + ['pyflakes', 'pythontidy'], '*.rst': DEFAULT_RULES, '*.rb': DEFAULT_RULES + ['ruby', 'rubocop'], @@ -114,6 +119,7 @@ STDIN_CONTENTS = None +ENCODING_BY_FILE = dict() class BaseException(Exception): @@ -165,6 +171,35 @@ def wrap(f): return wrap +def needs_unicode(fix_function): + """ + decorator for a _fix_... function to make it work with a pair of + unicode files (or file-like objects) internally instead of a pair + of byte-files (which are still used externally). + + The returned function has an attribute `needs_encoding` which tells + the calling function that it needs an encoding argument (the name of + the encoding to use). + """ + + def wrapped_fix(src, dst, encoding_or_options): + if isinstance(encoding_or_options, basestring): + encoding = encoding_or_options + options = None + else: + encoding = encoding_or_options['encoding'] + options = encoding_or_options + # decode + encode + src = codecs.EncodedFile(src, encoding) + dst = codecs.EncodedFile(dst, encoding) + if options: + return fix_function(src, dst, options) + else: + return fix_function(src, dst) + + wrapped_fix.needs_encoding = True + return wrapped_fix + def is_python3(fd): '''check first line of file object whether it contains "python3" (shebang)''' @@ -191,10 +226,11 @@ def _validate_notabs(fd): return b'\t' not in fd.read() +@needs_unicode def _fix_notabs(src, dst): original = src.read() fixed = original.replace(b'\t', b' ' * 4) - dst.write(fixed.decode()) + dst.write(fixed) @message('contains carriage return (CR)') @@ -202,32 +238,21 @@ def _validate_nocr(fd): return b'\r' not in fd.read() +@needs_unicode def _fix_nocr(src, dst): original = src.read() - fixed = original.replace(b'\r', b'') - dst.write(fixed.decode()) - - -@message('is not UTF-8 encoded') -def _validate_utf8(fd): - ''' - >>> _validate_utf8(BytesIO(b'foo')) - True - ''' - try: - fd.read().decode('utf-8') - except UnicodeDecodeError: - return False - return True + fixed = original.replace('\r', '') + dst.write(fixed) -@message('is not ASCII encoded') -def _validate_ascii(fd): - try: - fd.read().decode('ascii') - except UnicodeDecodeError: - return False - return True +def encoding_validator(encoding): + def validate_encoding(fd): + try: + fd.read().decode(encoding) + except UnicodeDecodeError: + return "is not %s-encoded" % encoding.upper() + return True + return validate_encoding @message('has UTF-8 byte order mark (BOM)') @@ -263,6 +288,7 @@ def _validate_notrailingws(fd): return True +@needs_unicode def _fix_notrailingws(src, dst): for line in src: dst.write(line.rstrip()) @@ -806,15 +832,29 @@ def notify(*args): print(*args) +def get_encoding_rule(rules): + for rule in rules: + try: + codecs.lookup(rule) + return rule + except LookupError: + continue + + def validate_file_with_rules(fname, rules): + encoding = get_encoding_rule(rules) + ENCODING_BY_FILE[fname] = encoding with open_file_for_read(fname) as fd: for rule in rules: logging.debug('Validating %s with %s..', fname, rule) fd.seek(0) func = globals().get('_validate_' + rule) if not func: - notify(rule, 'does not exist') - continue + if rule == encoding: + func = encoding_validator(encoding) + else: + notify(rule, 'does not exist') + continue options = CONFIG.get('options', {}).get(rule) try: if options: @@ -870,6 +910,7 @@ def fix_file(fname, rules): if CONFIG.get('create_backup', True): dirname, basename = os.path.split(fname) shutil.copy2(fname, os.path.join(dirname, CONFIG['backup_filename'].format(original=basename))) # creates a backup + encoding = ENCODING_BY_FILE[fname] with open_file_for_read(fname) as fd: dst = fd for rule in rules: @@ -882,7 +923,10 @@ def fix_file(fname, rules): src.seek(0) try: if options: + options['encoding'] = encoding func(src, dst, options) + elif func.needs_encoding: + func(src, dst, encoding) else: func(src, dst) was_fixed &= True @@ -896,7 +940,7 @@ def fix_file(fname, rules): # b) some fix functions destroyed the code if was_fixed and len(fixed) > 0: with open_file_for_write(fname) as fd: - fd.write(fixed.encode()) + fd.write(fixed) return True else: notify('{0}: ERROR fixing file. File remained unchanged'.format(fname)) diff --git a/test/unicode-test.yaml b/test/unicode-test.yaml new file mode 100644 index 0000000..bf77456 --- /dev/null +++ b/test/unicode-test.yaml @@ -0,0 +1,3 @@ +example: | + This line ends with a space. + This line has non-ASCII signs in it – which breaks codevalidator 0.8.2.