sbk.mnemonic

src/sbk/mnemonic.py
# This file is part of the SBK project
# https://github.com/mbarkhau/sbk
#
# Copyright (c) 2019-2021 Manuel Barkhau (mbarkhau@gmail.com) - MIT License
# SPDX-License-Identifier: MIT

"""Wordlists for SBK."""

import os
import struct
import typing as typ

import pylev

from . import enc_util

WORDLIST_STR = """
abacus     abraham    academy    acrobat    admiral    albino     alcohol    aquarium
atlantic   attorney   augustus   avocado    bazooka    beehive    beirut     benjamin
bible      bicycle    bismarck   blanket    boeing     bohemia    bolivia    bridge
broccoli   brussels   budapest   buffalo    button     cabbage    caesar     captain
carolina   caucasus   cherry     church     cinnamon   claudius   coconut    coffee
computer   cookie     coupon     cowboy     crystal    cyprus     darwin     dentist
deputy     detroit    diploma    doctor     dolphin    donut      dortmund   dracula
dublin     eagle      earpiece   edison     egypt      elephant   elvis      embassy
ethiopia   fairy      ferrari    firefly    flower     football   france     freddie
fujitsu    galileo    gameboy    geisha     ghost      glasgow    google     gorilla
gotham     gymnast    halifax    harvard    hawaii     headset    heineken   helsinki
hendrix    hepburn    hitachi    hunter     hyundai    indiana    iphone     island
jacket     jakarta    jericho    jigsaw     joystick   judge      jukebox    julius
kabul      kafka      kangaroo   kashmir    keyboard   kidney     kimono     knight
koala      kodak      kolkata    kosovo     kurosawa   laptop     latvia     lawyer
leather    lebanon    leibniz    lenin      library    lobster    lunatic    macbook
mason      meatball   mechanic   medusa     mercury    messi      michigan   miller
miyazaki   moldova    movie      mozart     muffin     muhammad   mumbai     mushroom
myanmar    nagasaki   nairobi    nanjing    napoleon   necklace   needle     netflix
newton     normandy   obelix     onion      ontario    oregon     orwell     oxford
package    pakistan   pancake    papaya     peanut     pelican    penguin    pepper
peugeot    picasso    pigeon     pilot      pistol     pizza      plumber    podium
popcorn    potato     present    printer    prophet    pumpkin    pyramid    python
queen      rabbit     radio      renault    reporter   rhubarb    romania    ronaldo
rousseau   saddam     salmon     samurai    santiago   satoshi    sausage    school
server     sheriff    siemens    simpson    sisyphus   slippers   slovakia   socrates
soldier    sparrow    spider     squid      sultan     sunlight   surgeon    suzuki
teacup     temple     tequila    texas      titanic    tobacco    toilet     tokyo
trinidad   trumpet    tshirt     tunisia    turtle     tuxedo     twitter    ukraine
ulysses    unesco     uruguay    vampire    victoria   violin     virginia   vivaldi
vladimir   volcano    voyager    waffle     walnut     warrior    wasabi     watanabe
webcam     whisky     wizard     xerox      yoghurt    yokohama   zambia     zimbabwe
"""


WORDLIST = WORDLIST_STR.split()
WORDSET  = set(WORDLIST)

assert len(WORDLIST) == 256
assert len(WORDSET ) == 256
assert sorted(WORDLIST) == WORDLIST
assert all(5 <= len(w) <= 8 for w in WORDLIST)
assert len({w[:3] for w in WORDLIST}) == 256
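
# The assertions above pin down the wordlist invariants: exactly 256
# words (one per byte value), no duplicates, sorted order, 5-8
# characters per word, and a unique three-letter prefix for every
# word. An illustrative consequence (doctest-style, not part of the
# original module): any word can be recovered from its first three
# characters alone.
#
#   >>> [w for w in WORDLIST if w.startswith("zim")]
#   ['zimbabwe']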

WORD_INDEXES   = {w: i for i, w in enumerate(WORDLIST)}
wordlist_index = WORD_INDEXES.__getitem__

assert wordlist_index("abacus"  ) == 0
assert wordlist_index("zimbabwe") == 255
assert wordlist_index(WORDLIST[127]) == 127


PhraseStr = str


def byte2word(data: bytes) -> str:
    assert len(data) == 1
    word_idx = enc_util.char_at(data, 0)
    return WORDLIST[word_idx]


def _bytes2phrase_words(data: bytes) -> typ.Iterable[str]:
    for i in range(len(data)):
        word_idx = enc_util.char_at(data, i)
        word     = WORDLIST[word_idx]
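        # Pad each word to 9 columns so the words line up when the
        # phrase is printed (the longest word is 8 characters).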
        yield word.ljust(9)


def bytes2phrase(data: bytes) -> PhraseStr:
    """Encode data as a human readable phrases."""
    if len(data) % 2 != 0:
        errmsg = f"Invalid len(data), must be multiple of 2, was {len(data)}"
        raise ValueError(errmsg)

    words = iter(_bytes2phrase_words(data))

    word_pairs = []
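    # Consume words two at a time; the StopIteration raised by `next`
    # marks the end of the data and terminates the pairing loop.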
    try:
        while True:
            word_pair = next(words) + " " + next(words)
            word_pairs.append(word_pair)
    except StopIteration:
        return "\n".join(word_pairs)


def fuzzy_match(word: str) -> str:
    def dist_fn(wl_word: str) -> int:
        dist = pylev.damerau_levenshtein(word, wl_word)
        assert isinstance(dist, int)
        return dist

    dist, wl_word = min((dist_fn(wl_word), wl_word) for wl_word in WORDLIST)
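    # Accept the closest wordlist entry only if it is within three
    # edits; anything farther away is treated as an unknown word.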
    if dist < 4:
        return wl_word
    else:
        errmsg = f"Unknown word: {word}"
        raise ValueError(errmsg, word)
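
# Example (illustrative, not part of the original module): a close
# typo is corrected to its nearest wordlist entry.
#
#   >>> fuzzy_match("kangoroo")
#   'kangaroo'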


def phrase2words(phrase: PhraseStr) -> typ.Iterable[str]:
    for word in phrase.split():
        word = word.strip().lower()
        if word not in WORDSET:
            word = fuzzy_match(word)
        yield word


def _phrase2bytes(phrase: PhraseStr) -> typ.Iterable[bytes]:
    for word in phrase2words(phrase):
        yield struct.pack("B", wordlist_index(word))


def phrase2bytes(phrase: PhraseStr) -> bytes:
    """Decode human readable phrases to bytes."""
    return b"".join(_phrase2bytes(phrase))


def main() -> None:
    test_data = os.urandom(8)
    print(bytes2phrase(test_data))


if __name__ == '__main__':
    main()