Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
*.pyc
*~
26 changes: 26 additions & 0 deletions robin/1-gram_freqs.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
A 303396
B 60092
C 90005
D 165009
E 486510
F 87232
G 74878
H 245124
I 259236
J 5641
K 24720
L 152602
M 104040
N 272454
O 295514
P 58819
Q 4436
R 231153
S 235731
T 336581
U 109869
V 40898
W 92701
X 6244
Y 85240
Z 1730
6 changes: 0 additions & 6 deletions robin/quadgram_freqs.txt → robin/4-gram_freqs.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37901,7 +37901,6 @@ OZEA 1
OZEN 32
OZEW 2
OZWH 1
P 1
PAAN 12
PAAT 1
PABE 1
Expand Down Expand Up @@ -38919,7 +38918,6 @@ PPYW 46
PPYY 11
PQUA 1
PQUI 9
PR 1
PRAC 64
PRAI 160
PRAN 53
Expand Down Expand Up @@ -38957,7 +38955,6 @@ PRIS 344
PRIT 4
PRIV 166
PRIZ 102
PRO 1
PROA 219
PROB 398
PROC 241
Expand Down Expand Up @@ -46272,7 +46269,6 @@ SYWR 1
SYYE 2
SYYO 3
SZEA 3
T 1
TAAG 1
TAAL 3
TAAN 16
Expand Down Expand Up @@ -47201,7 +47197,6 @@ TGUE 8
TGUI 5
TGUS 1
TGUT 177
TH 1
THAA 4
THAB 52
THAC 69
Expand Down Expand Up @@ -47249,7 +47244,6 @@ THDO 17
THDR 44
THDU 6
THDW 2
THE 1
THEA 1695
THEB 1584
THEC 3200
Expand Down
10 changes: 5 additions & 5 deletions robin/caesar.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python

import string
from quadgram import Scorer
from ngram import NGramSet


ALPHABET = list(string.uppercase)
Expand All @@ -12,15 +12,15 @@ def decrypt(key, text):
return ''.join(table.get(ch, ch) for ch in text.upper())


def solve(text, scorer=None):
if scorer is None:
scorer = Scorer.make_pretrained()
def solve(text, ngram_set=None):
if ngram_set is None:
ngram_set = NGramSet.make_pretrained()

best_key = None
best_score = None

for key in range(len(ALPHABET)):
score = scorer.score(decrypt(key, text))
score = ngram_set.score(decrypt(key, text))
if best_key is None or score > best_score:
best_key = key
best_score = score
Expand Down
48 changes: 30 additions & 18 deletions robin/quadgram.py → robin/ngram.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,49 @@
from collections import defaultdict
from math import log

class Scorer(object):
def __init__(self):
class NGramSet(object):
def __init__(self, n):
self.total = 0
self.freqs = defaultdict(int)
self.n = n

@classmethod
def make_pretrained(cls, filename=None):
def make_pretrained(cls, n=4, filename=None):

if filename is None:
from os.path import dirname, join
filename = join(dirname(__file__), "quadgram_freqs.txt")
filename = join(dirname(__file__), "%d-gram_freqs.txt" % n)

scorer = cls()
scorer = cls(n)
with open(filename) as f:
for line in f:
if not line.startswith('#'):
parts = line.split()
seq = parts[0]
count = int(parts[1])
assert len(seq) == n
scorer.freqs[seq] = count
scorer.total += count
return scorer

def populate(self, chars):
for seq in get_quadgrams(chars):
for seq in get_ngrams(chars, self.n):
self.total += 1
self.freqs[seq] += 1

def populate_from_file(self, filename):
    """Add the n-grams found in the file *filename* to this set's counts.

    The file's characters are obtained from ``read_filechars`` and handed
    unchanged to ``populate``.
    """
    file_chars = read_filechars(filename)
    self.populate(file_chars)

def freq(self, seq):
    """Return the recorded count for the n-gram *seq*, or 0 if it is unknown.

    A non-inserting lookup is used so that asking about an unseen sequence
    never adds a key to ``self.freqs``.
    """
    return self.freqs.get(seq, 0)

def score(self, text):
p = 0.0
divisor = float(self.total)
for seq in get_quadgrams(text):
for seq in get_ngrams(text, self.n):
if seq in self.freqs:
count = self.freqs[seq]
else:
Expand All @@ -53,7 +61,7 @@ def read_filechars(filename):
yield ch


def get_quadgrams(chars):
def get_ngrams(chars, n):
seq = ""
for ch in chars:
if ch.isdigit():
Expand All @@ -63,24 +71,28 @@ def get_quadgrams(chars):
else:
continue

if len(seq) == 4:
seq = seq[1:]

seq = seq + ch

if '.' not in seq:
if len(seq) > n:
seq = seq[-n:]

if len(seq) == n and '.' not in seq:
yield seq


if __name__ == "__main__":
    # Command-line use: ``ngram.py n [file...]``.
    # Counts the n-grams of length n in the given files and prints one
    # "<ngram>\t<count>" line per distinct n-gram, sorted by n-gram.
    import sys

    if len(sys.argv) < 2:
        sys.stderr.write("Usage %s n [file...]\n" % sys.argv[0])
        sys.exit(1)

    # argv[1] is the n-gram length; the remaining arguments are input files.
    ngram_set = NGramSet(int(sys.argv[1]))

    for filename in sys.argv[2:]:
        # Must use the object created above; the former name ``scorer`` is
        # no longer bound in this script.
        ngram_set.populate_from_file(filename)

    for ngram in sorted(ngram_set.freqs):
        sys.stdout.write("%s\t%d\n" % (ngram, ngram_set.freqs[ngram]))
58 changes: 54 additions & 4 deletions robin/polysubstitution.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import string

ALPHABET = list(string.uppercase)
from substitution import SubstitutionSolver, ALPHABET


def get_letters(chars):
Expand Down Expand Up @@ -32,6 +31,47 @@ def get_index_of_coincidence(chars, period):
return sum(ics) / len(ics)


class PolySubstitutionSolver(SubstitutionSolver):
    """
    Solver for "polysubstitution" ciphers.

    In such a cipher every plaintext letter is replaced by a token made up
    of several letters, and all tokens have the same length. This is not
    the same thing as a polyalphabetic substitution cipher such as Vigenère.
    """

    def __init__(self, token_length, *args):
        """
        Remember *token_length* (letters per cipher token) and forward any
        remaining positional arguments to the base class constructor.
        """
        super(PolySubstitutionSolver, self).__init__(*args)
        self.token_length = token_length

    def get_ciphertext(self, text):
        """
        Split *text* into a list of cipher tokens.

        The text is upper-cased first. Characters found in ALPHABET are
        grouped, in order, into tokens of ``token_length`` letters. Every
        other character is emitted at once as its own list item, so one
        that falls inside a token appears before that token in the result.
        Letters left over at the end that do not fill a whole token are
        not emitted.
        """
        tokens = []
        pending = ''

        for symbol in text.upper():
            if symbol in ALPHABET:
                pending += symbol
                if len(pending) == self.token_length:
                    tokens.append(pending)
                    pending = ''
            else:
                tokens.append(symbol)

        return tokens

    def get_cipher_alphabet(self, ciphertext):
        """
        Return the set of distinct cipher tokens occurring in *ciphertext*.

        *ciphertext* is an iterable of strings (presumably the list built
        by ``get_ciphertext`` -- confirm against the base class). The
        result is padded with "dummy-NN" placeholders until it has as many
        entries as ALPHABET.

        Raises Exception if more distinct tokens than ALPHABET entries are
        found.
        """
        # Length alone cannot tell a token from a non-letter item when
        # token_length is 1, hence the extra first-character test.
        found = set(
            item for item in ciphertext
            if len(item) == self.token_length and item[0] in ALPHABET
        )

        if len(found) > len(ALPHABET):
            raise Exception("Too many (%d) tokens in cipher alphabet" % len(found))

        missing = len(ALPHABET) - len(found)
        found.update("dummy-%02d" % index for index in range(missing))

        return found


if __name__ == '__main__':
import sys

Expand All @@ -40,5 +80,15 @@ def get_index_of_coincidence(chars, period):
text = f.read()
for period in range(1, 20):
print "%3d\t%04f" % (period, get_index_of_coincidence(text, period))
elif len(sys.argv) == 3:
token_length = int(sys.argv[1])
solver = PolySubstitutionSolver(token_length)
with open(sys.argv[2]) as f:
text = f.read()
key, plaintext = solver.solve(text)
print key
print plaintext
else:
print >>sys.stderr, "Usage %s filename" % sys.argv[0]
print >>sys.stderr, "Usage:"
print >>sys.stderr, " %s filename" % sys.argv[0]
print >>sys.stderr, " %s token_length filename" % sys.argv[0]
Loading