# Command-line tool: count the occurrences of characters in files.
import argparse import os import re import sys from collections import Counter from fnmatch import fnmatch
# Named character-set patterns selectable with -l/--library.
# Each value is a regular expression that matches exactly one character.
libraries = {
    # Word characters: "not a non-word character", i.e. the same set as \w.
    "c": r"[^\W]",
    # Word characters or whitespace.
    "cp": r"[^\W]|[\s]",
    # Code points U+4E00 through U+9FFF.
    "cn": r"[\u4e00-\u9fff]",
    # ASCII letters, either case.
    "en": r"[a-zA-Z]",
    # ASCII letters plus anything matched by \d.
    "alnum": r"[a-zA-Z\d]",
    # Digits.
    "num": r"[\d]",
    # Whitespace.
    "sp": r"[\s]",
    # Anything that is neither a word character nor whitespace.
    "punc": r"[^\w\s]",
}
def process_file(file, regex, library, ignore_space, ignore_case, verbose):
    """Read *file* and return the list of items that should be counted.

    Args:
        file: Path of the text file to read; it is decoded as UTF-8.
        regex: Regular expression whose matches are collected. When
            non-empty it takes precedence over *library*.
        library: Key into the module-level ``libraries`` table; used only
            when *regex* is empty.
        ignore_space: If true, all whitespace is removed from the content
            before matching.
        ignore_case: If true, the content is lower-cased before matching
            and *regex* is matched case-insensitively.
        verbose: If true, print the file name before reading it.

    Returns:
        The ``re.findall`` result for *regex* or for the selected library
        pattern, or every character of the preprocessed content when
        neither is given.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
        re.error: If *regex* is not a valid regular expression.
        KeyError: If *library* is not a key of ``libraries``.
    """
    if verbose:
        print(f"Processing file: {file}")

    with open(file, 'r', encoding='utf-8') as f:
        content = f.read()

    if ignore_space:
        content = re.sub(r'\s', '', content)
    if ignore_case:
        content = content.lower()

    if regex:
        # The content has already been lower-cased, so without IGNORECASE a
        # pattern containing upper-case letters (e.g. "[A-Z]") could never
        # match in ignore-case mode.
        flags = re.IGNORECASE if ignore_case else 0
        return re.findall(regex, content, flags)
    if library:
        return re.findall(libraries[library], content)
    return list(content)
# Printable stand-ins for characters that would otherwise be invisible in the
# report. Built once instead of on every output row.
_ESCAPES = {
    " ": r"\s",
    "\n": r"\n",
    "\t": r"\t",
    "\r": r"\r",
    "\f": r"\f",
    "\v": r"\v",
    "\b": r"\b",
}


def _has_wanted_format(name, file_formats):
    """Return True if *name* passes the extension filter (an empty filter accepts everything)."""
    return not file_formats or any(fnmatch(name, f'*.{fmt}') for fmt in file_formats)


def _iter_target_files(paths, file_formats, recursive):
    """Yield the files selected by *paths*, in command-line order.

    A file path is yielded if it passes the extension filter. A directory
    contributes its matching files, and those of its sub-directories when
    *recursive* is true. Anything else is reported on stderr and skipped.
    """
    for path in paths:
        if os.path.isfile(path):
            if _has_wanted_format(path, file_formats):
                yield path
        elif os.path.isdir(path):
            for root, _dirs, files in os.walk(path):
                for name in files:
                    if _has_wanted_format(name, file_formats):
                        yield os.path.join(root, name)
                if not recursive:
                    # os.walk yields the top directory first; stop before
                    # it descends into sub-directories.
                    break
        else:
            print(f"Warning: skipping {path}: not a file or directory", file=sys.stderr)


def _write_report(ranked, stream):
    """Write one tab-separated ``rank, character, count`` row per entry of *ranked* to *stream*."""
    for rank, (char, count) in enumerate(ranked, start=1):
        print(f"{rank}\t{_ESCAPES.get(char, char)}\t{count}", file=stream)


def main():
    """Parse the command line, count characters in the selected files and print a report.

    The report rows go to the file named by ``--output`` or to stdout,
    followed on stdout by the list of files that were processed. Files that
    cannot be read or decoded are reported on stderr and skipped. Exits
    with status 1 when ``--expression`` and ``--library`` are combined.
    """
    parser = argparse.ArgumentParser(
        description='Count the occurrences of characters in files.',
        # RawTextHelpFormatter prints the epilog verbatim, so the table is
        # laid out explicitly, one library per line.
        epilog=(
            "libraries:\n"
            "  c      Word characters (letters, digits, underscore).\n"
            "  cp     Word and space characters.\n"
            "  cn     All common Chinese characters.\n"
            "  en     All English alphabetic characters.\n"
            "  alnum  Alphabetic and numeric characters.\n"
            "  num    Numeric characters.\n"
            "  sp     Space characters.\n"
            "  punc   Punctuation characters."
        ),
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument('-n', '--number', metavar="number", type=int, default=0,
                        help='The number of most common characters to display.')
    parser.add_argument('-e', '--expression', metavar="regex", type=str, default="",
                        help='The regular expression to match.')
    parser.add_argument('-l', '--library', metavar="library", type=str, choices=libraries.keys(),
                        help='The character set library to use.')
    parser.add_argument('-f', '--format', metavar="format", type=str, default="",
                        help='The file formats to process.')
    parser.add_argument('-o', '--output', metavar="output", type=str, default="",
                        help='The output file.')
    parser.add_argument('-r', '--reverse', action='store_true', default=False,
                        help='Reverse the order of the output.')
    parser.add_argument('-R', '--recursive', action='store_true', default=False,
                        help='Recursively process directories.')
    parser.add_argument('-S', '--show-space', action='store_true', default=False,
                        help='Show whitespace characters.')
    # The flag turns case sensitivity ON; the previous help text claimed the
    # opposite.
    parser.add_argument('-i', '--case-sensitive', action='store_true', default=False,
                        help='Match case-sensitively (case is ignored by default).')
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help='Display verbose output.')
    parser.add_argument('paths', nargs='+', help='The files or directories to process.')
    args = parser.parse_args()

    if args.expression and args.library:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit("Error: --expression and --library options cannot be used together.")
    # No separate "unknown library" check: argparse already rejects any value
    # outside `choices`, so such a check could never trigger.

    # Tolerate spaces around the commas ("py, txt") and drop empty entries.
    file_formats = [fmt.strip() for fmt in args.format.split(',') if fmt.strip()]

    results = []
    processed_files = []
    for path in _iter_target_files(args.paths, file_formats, args.recursive):
        try:
            matches = process_file(path, args.expression, args.library,
                                   not args.show_space, not args.case_sensitive,
                                   args.verbose)
        except (OSError, UnicodeDecodeError) as err:
            # One unreadable or non-UTF-8 file should not abort the whole run.
            print(f"Warning: skipping {path}: {err}", file=sys.stderr)
            continue
        results.extend(matches)
        processed_files.append(path)

    counter = Counter(results)
    most_common = counter.most_common(args.number if args.number > 0 else None)
    if args.reverse:
        most_common.sort(key=lambda item: (item[1], item[0]))
    else:
        most_common.sort(key=lambda item: (-item[1], item[0]))

    if args.output:
        # The context manager closes the file even if writing fails.
        with open(args.output, 'w', encoding='utf-8') as out:
            _write_report(most_common, out)
    else:
        _write_report(most_common, sys.stdout)

    print("Processed files:")
    for file in processed_files:
        print(file)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|