CambridgeProgrammerStudyGroup · bricef · Jun 14, 2016 · Jun 12, 2016 · Jun 12, 2016 · Jun 12, 2016
diff --git a/robin/caesar.py b/robin/caesar.py
@@ -0,0 +1,40 @@
+#!/usr/bin/python
+
+import string
+from quadgram import Scorer
+
+
+# The 26 unaccented capital letters, in order.  string.ascii_uppercase is used
+# rather than string.uppercase because the latter is locale-dependent and can
+# change after locale.setlocale(), which would alter the shift alphabet.
+ALPHABET = list(string.ascii_uppercase)
+
+
+def decrypt(key, text):
+    """Undo a Caesar shift of ``key`` places.
+
+    ``text`` is upper-cased first.  Each character found in ALPHABET is moved
+    ``key`` positions back (wrapping around); every other character is passed
+    through unchanged.  ``key`` may be any integer: it is reduced modulo the
+    alphabet size, so 27 behaves like 1 and -1 like 25.
+    """
+    # Without the reduction, a key beyond the alphabet size makes both slices
+    # degenerate and the table silently becomes the identity mapping.
+    shift = key % len(ALPHABET)
+    rotated = ALPHABET[shift:] + ALPHABET[:shift]
+    table = dict(zip(rotated, ALPHABET))
+    return ''.join(table.get(ch, ch) for ch in text.upper())
+
+
+def solve(text, scorer=None):
+    """Return the Caesar key whose decryption of ``text`` scores highest.
+
+    Every key in range(len(ALPHABET)) is tried in ascending order and the
+    decrypted text is passed to ``scorer.score``.  When several keys share the
+    best score the smallest one is returned.  If ``scorer`` is None a
+    pretrained Scorer is created with Scorer.make_pretrained().
+    """
+    if scorer is None:
+        scorer = Scorer.make_pretrained()
+
+    candidates = range(len(ALPHABET))
+    if not candidates:
+        # Nothing to try: mirror the "no best key" result.
+        return None
+
+    # max() keeps the first maximal element, i.e. the lowest key on ties.
+    return max(candidates, key=lambda shift: scorer.score(decrypt(shift, text)))
+
+
+if __name__ == '__main__':
+    import sys
+
+    # Command-line entry point: guess the Caesar key for the text in the named
+    # file, then print the key followed by the decrypted text.
+    if len(sys.argv) != 2:
+        # sys.exit(str) writes the message to stderr and exits with status 1,
+        # so a usage error is no longer reported to the shell as success.
+        sys.exit("Usage %s filename" % sys.argv[0])
+
+    with open(sys.argv[1]) as f:
+        text = f.read()
+    key = solve(text)
+    # A parenthesised single argument prints the same as the bare statement.
+    print(key)
+    print(decrypt(key, text))
diff --git a/robin/polysubstitution.py b/robin/polysubstitution.py
@@ -0,0 +1,44 @@
+#!/usr/bin/python
+
+import string
+
+# The 26 unaccented capital letters, in order.  string.ascii_uppercase is used
+# rather than string.uppercase because the latter is locale-dependent and can
+# change after locale.setlocale(), which would alter what counts as a letter.
+ALPHABET = list(string.ascii_uppercase)
+
+
+def get_letters(chars):
+    """Yield the upper-cased form of each item of ``chars`` that is in ALPHABET.
+
+    Items whose upper-cased form is not in ALPHABET (digits, whitespace,
+    punctuation, ...) are skipped.
+    """
+    for raw in chars:
+        letter = raw.upper()
+        if letter not in ALPHABET:
+            continue
+        yield letter
+
+
+def get_freq_dicts(chars, period):
+    """Count letters per column when ``chars`` is split into ``period`` columns.
+
+    The letters of ``chars`` (as filtered by get_letters) are assigned to
+    columns in rotation: the n-th letter goes to column n % period.  Returns a
+    list of ``period`` dicts, each mapping every letter of ALPHABET to the
+    number of times it occurs in that column.
+    """
+    columns = [dict.fromkeys(ALPHABET, 0) for _ in range(period)]
+
+    for position, letter in enumerate(get_letters(chars)):
+        column = columns[position % period]
+        column[letter] += 1
+
+    return columns
+
+
+def get_index_of_coincidence(chars, period):
+    """Return the mean index of coincidence over the ``period`` columns of ``chars``.
+
+    The per-column letter counts come from get_freq_dicts.  For a column with
+    N letters and per-letter counts n_i the index is
+    sum(n_i * (n_i - 1)) / (N * (N - 1)); the result is the plain average of
+    those values.
+
+    A column holding fewer than two letters has no defined index (its
+    denominator is zero), so it is left out of the average instead of raising
+    ZeroDivisionError.  Returns 0.0 when no column has a defined index.
+    """
+    ics = []
+    for freq_dict in get_freq_dicts(chars, period):
+        counts = freq_dict.values()
+        total = float(sum(counts))
+        if total < 2:
+            # N * (N - 1) would be zero: skip the undefined column.
+            continue
+        coincidences = sum(count * (count - 1) for count in counts)
+        ics.append(coincidences / (total * (total - 1)))
+
+    if not ics:
+        return 0.0
+    return sum(ics) / len(ics)
+
+
+if __name__ == '__main__':
+    import sys
+
+    # Command-line entry point: print the index of coincidence of the named
+    # file's text for each candidate period from 1 to 19, one per line.
+    if len(sys.argv) != 2:
+        # sys.exit(str) writes the message to stderr and exits with status 1,
+        # so a usage error is no longer reported to the shell as success.
+        sys.exit("Usage %s filename" % sys.argv[0])
+
+    with open(sys.argv[1]) as f:
+        text = f.read()
+    for period in range(1, 20):
+        # NOTE(review): "%04f" only sets a minimum width of 4 and keeps the
+        # default six decimals; "%.4f" may have been intended -- confirm.
+        print("%3d\t%04f" % (period, get_index_of_coincidence(text, period)))
diff --git a/robin/quadgram.py b/robin/quadgram.py
@@ -0,0 +1,86 @@
+from collections import defaultdict
+from math import log
+
+class Scorer(object):
+    """Score text by the summed log relative frequency of its quadgrams.
+
+    ``freqs`` maps a quadgram to the number of times it has been counted and
+    ``total`` is the sum of all those counts.
+    """
+
+    def __init__(self):
+        self.total = 0
+        self.freqs = defaultdict(int)
+
+    @classmethod
+    def make_pretrained(cls, filename=None):
+        """Build a scorer from a file of ``<quadgram> <count>`` lines.
+
+        Fields are separated by whitespace.  Lines starting with '#' and blank
+        lines are ignored.  If a quadgram appears on more than one line its
+        counts are added together, so ``total`` always equals the sum of
+        ``freqs``.  When ``filename`` is None, the file ``quadgram_freqs.txt``
+        in this module's directory is read.
+        """
+        if filename is None:
+            from os.path import dirname, join
+            filename = join(dirname(__file__), "quadgram_freqs.txt")
+
+        scorer = cls()
+        with open(filename) as f:
+            for line in f:
+                if line.startswith('#'):
+                    continue
+                parts = line.split()
+                if not parts:
+                    # Blank line, e.g. an empty line at the end of the file.
+                    continue
+                count = int(parts[1])
+                # Accumulate rather than assign so a repeated key cannot make
+                # freqs and total disagree.
+                scorer.freqs[parts[0]] += count
+                scorer.total += count
+        return scorer
+
+    def populate(self, chars):
+        """Add one count for every quadgram that get_quadgrams finds in ``chars``."""
+        for seq in get_quadgrams(chars):
+            self.total += 1
+            self.freqs[seq] += 1
+
+    def populate_from_file(self, filename):
+        """Count the quadgrams of the file ``filename`` (see populate)."""
+        self.populate(read_filechars(filename))
+
+    def score(self, text):
+        """Return the sum of log(count / total) over the quadgrams of ``text``.
+
+        A quadgram with no positive recorded count is given a pseudo-count of
+        0.1 so that the logarithm is always defined.  Returns 0.0 when
+        ``text`` contains no quadgrams.  Raises ZeroDivisionError if ``total``
+        is zero and ``text`` contains at least one quadgram.
+        """
+        p = 0.0
+        divisor = float(self.total)
+        for seq in get_quadgrams(text):
+            # .get() does not insert missing keys into the defaultdict.
+            count = self.freqs.get(seq, 0)
+            if count <= 0:
+                count = 0.1
+
+            p += log(count / divisor)
+        return p
+
+
+def read_filechars(filename):
+    """Yield the contents of the file ``filename`` one character at a time.
+
+    The file is read in fixed-size chunks rather than with one read() call
+    per character; the sequence of characters yielded is the same.  The file
+    is closed when the generator is exhausted or closed.
+    """
+    chunk_size = 64 * 1024
+    with open(filename) as f:
+        while True:
+            chunk = f.read(chunk_size)
+            if not chunk:
+                break
+            for ch in chunk:
+                yield ch
+
+
+def get_quadgrams(chars):
+    """Yield every run of four consecutive letters in ``chars``, upper-cased.
+
+    Characters that are neither letters nor digits are dropped before the
+    window is formed, so quadgrams span spaces and punctuation.  Digits are
+    kept as a '.' placeholder: a window containing one is never yielded, so
+    letters on either side of a digit are not joined together.
+
+    Only full four-character windows are yielded; the one-, two- and
+    three-character prefixes at the very start of the input are not.
+    """
+    window = ""
+    for ch in chars:
+        if ch.isdigit():
+            ch = '.'
+        elif ch.isalpha():
+            ch = ch.upper()
+        else:
+            continue
+
+        # Trim after appending so the window can never exceed four characters,
+        # even if upper() turns one character into several.
+        window = (window + ch)[-4:]
+
+        if len(window) == 4 and '.' not in window:
+            yield window
+
+
+if __name__ == "__main__":
+    import sys
+
+    # Command-line entry point: tally the quadgrams of every file named on the
+    # command line, then write "<quadgram>\t<count>" lines to stdout sorted by
+    # quadgram.
+    scorer = Scorer()
+    for path in sys.argv[1:]:
+        scorer.populate_from_file(path)
+
+    # Keys are unique, so sorting the (quadgram, count) pairs orders by quadgram.
+    for quadgram, count in sorted(scorer.freqs.items()):
+        sys.stdout.write("%s\t%d\n" % (quadgram, count))