#!/usr/bin/env python
# This does simple normalized frequency analysis on UTF-8 encoded text. The
# result of the analysis is translated to a ranked list, where every byte is
# assigned a rank. This list is written to src/freqs.rs.
#
# Currently, the frequencies are generated from the following corpuses:
#
# * The CIA world fact book
# * The source code of rustc
# * Septuaginta
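#
# Example invocation (the corpus file names below are placeholders; the
# generated Rust is printed to stdout, so redirect it into src/freqs.rs):
#
#   python scripts/frequencies.py corpus1.txt corpus2.txt > src/freqs.rs
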
from __future__ import absolute_import, division, print_function
import argparse
from collections import Counter
import sys
preamble = '''
// NOTE: The following code was generated by "scripts/frequencies.py", do not
// edit directly
'''.lstrip()


def eprint(*args, **kwargs):
    kwargs['file'] = sys.stderr
    print(*args, **kwargs)


def main():
    p = argparse.ArgumentParser()
    p.add_argument('corpus', metavar='FILE', nargs='+')
    args = p.parse_args()

    # Get frequency counts of each byte. Seed the counter with every possible
    # byte value so that bytes absent from the corpus still receive a rank.
    freqs = Counter()
    for i in range(0, 256):
        freqs[i] = 0

    eprint('reading entire corpus into memory')
    corpus = []
    for fpath in args.corpus:
        corpus.append(open(fpath, 'rb').read())

    eprint('computing byte frequencies')
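    # Each occurrence is weighted by 1/len(c), so every corpus file contributes
    # a total weight of 1 and a single large file cannot drown out the others.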
    for c in corpus:
        for byte in c:
            freqs[byte] += 1.0 / float(len(c))

    eprint('writing Rust code')
    # Get the rank of each byte. A lower rank => lower relative frequency.
    rank = [0] * 256
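    # For example, the most frequent byte comes first in most_common() (i = 0)
    # and gets rank 255 - 0 = 255, while the least frequent of the 256 seeded
    # byte values comes last (i = 255) and gets rank 0.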
    for i, (byte, _) in enumerate(freqs.most_common()):
        # print(byte)
        rank[byte] = 255 - i

    # Forcefully set the highest rank possible for bytes that start multi-byte
    # UTF-8 sequences. The idea here is that a continuation byte will be more
    # discerning in a homogeneous haystack.
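    # (Bytes 0xC0-0xFF carry the 11xxxxxx bit pattern of UTF-8 lead bytes;
    # continuation bytes are 10xxxxxx, i.e. 0x80-0xBF, and keep the ranks
    # measured above.)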
    for byte in range(0xC0, 0xFF + 1):
        rank[byte] = 255

    # Now write Rust.
    olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = [']
    for byte in range(256):
        olines.append('    %3d, // %r' % (rank[byte], chr(byte)))
    olines.append('];')
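    # Each table entry has the form `rank, // repr(byte)`, e.g. `255, // 'e'`
    # if b'e' happened to be the highest-ranked byte; the actual values depend
    # on the corpus files supplied.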

    print(preamble)
    print('\n'.join(olines))


if __name__ == '__main__':
    main()