python_http_stuff/ichimoe.py at main · nicolft/python_http_stuff · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import csv
import requests
import re
from bs4 import BeautifulSoup
from typing import TypedDict

# Endpoint of the ichi.moe service queried by get_words(); the text to
# analyse is sent as the `q` query parameter of a GET request.
ichimoe_url = 'https://ichi.moe/cl/qr/'

class Word(TypedDict):
    """A single vocabulary entry: the word, its reading and its gloss."""

    # The word as written in the Japanese text (also used as the lookup key).
    jp: str
    # Reading given in the 【...】 brackets; same as `jp` when none is given.
    reading: str
    # Translation / gloss text; empty string when no gloss was found.
    trans: str

# Matches the "1. ", "2. ", ... numbering that may prefix a <dt> entry.
_ENTRY_NUMBER_RE = re.compile(r'^\d+\.\s*')


def _split_headword(dt_text: str) -> tuple[str, str]:
    """Split the text of a <dt> tag into ``(word, reading)``.

    A leading "N. " numbering is removed first. The remainder is expected
    to look like ``WORD 【READING】``; when no `` 【`` separator is present
    the word itself is returned as its reading.
    """
    parts = _ENTRY_NUMBER_RE.sub('', dt_text).strip().split(' 【')
    jp = parts[0]
    # parts[1] ends with the closing bracket '】', which is dropped.
    reading = parts[1][:-1] if len(parts) > 1 else jp
    return jp, reading


def get_words(jp_text: str, *, timeout: float = 30.0) -> dict[str, Word]:
    """Look up the words of a Japanese text on ichi.moe.

    Args:
        jp_text: Japanese text to analyse. Newlines are replaced by spaces
            before the text is sent.
        timeout: Seconds to wait for the HTTP request before
            ``requests`` raises a timeout error.

    Returns:
        A mapping from each word to its ``Word`` entry. The mapping is
        empty when ``jp_text`` is blank or the server does not answer
        with HTTP 200.

    Raises:
        requests.RequestException: On connection errors or timeouts
            (propagated from ``requests.get``).
    """
    words: dict[str, Word] = {}

    # Get Ichimoe page HTML
    jp_text = jp_text.replace("\n", " ")
    if not jp_text.strip():
        # Nothing to look up; avoid a pointless network round trip.
        return words

    response: requests.Response = requests.get(
        ichimoe_url, params={'q': jp_text}, timeout=timeout
    )
    if response.status_code != 200:
        return words

    # In ichi.moe, every word is described within a <dl> tag.
    # <dl>
    #   <dt>WORD</dt>
    #   <dd>info. and defn. about word</dd>
    # </dl>
    # These may be recursively nested (inside the dd tag) to show
    # compound words and conjugations.
    #
    # Hence, we keep only the innermost entries: every word is collected
    # first, then the heading of any <dl> that encloses another <dl> is
    # removed again.
    soup = BeautifulSoup(response.text, 'html.parser')
    dt_tags = soup.find_all('dt')

    # Pass 1: record every <dt> heading together with its gloss.
    for dt_tag in dt_tags:
        jp, reading = _split_headword(dt_tag.text)
        dd_tag = dt_tag.find_next_sibling('dd')
        # A <dt> without a following <dd> simply has no translation.
        gloss = dd_tag.find('span', 'gloss-desc') if dd_tag is not None else None
        trans = gloss.text.strip() if gloss is not None else ''
        words[jp] = {'jp': jp, 'reading': reading, 'trans': trans}

    # Pass 2: for every <dt>, look at the <dl> that encloses its own <dl>
    # and drop that enclosing entry's heading, since it has nested entries.
    for dt_tag in dt_tags:
        own_dl = dt_tag.find_parent('dl')
        outer_dl = own_dl.find_parent('dl') if own_dl is not None else None
        if outer_dl is None:
            continue
        outer_dt = outer_dl.find('dt', recursive=False)
        if outer_dt is not None:
            words.pop(_split_headword(outer_dt.text)[0], None)

    return words

if __name__ == "__main__":
    # Take an input file (which contains Japanese) and output
    # Japanese words, readings, and translations into a .tsv file
    # (ichimoe_output.tsv in the current working directory).
    import sys

    if len(sys.argv) != 2:
        print(f"Usage: python3 {sys.argv[0]} <input_file>")
        sys.exit(1)

    input_file = sys.argv[1]

    with open(input_file, "r", encoding="utf-8") as f:
        s = f.read()

    # Ichimoe can handle upwards of 700+ characters. But somewhere there is a limit.
    # The text is therefore sent one line at a time.
    paragraphs = s.split('\n')

    words = {}
    for paragraph in paragraphs:
        if not paragraph.strip():
            # Blank lines contain no words; skip them instead of issuing
            # an empty query over the network.
            continue
        # Later lines overwrite earlier entries for the same word.
        words |= get_words(paragraph)

    with open('ichimoe_output.tsv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(['Japanese', 'Reading', 'Translation'])  # Header
        writer.writerows(
            [item['jp'], item['reading'], item['trans']]
            for item in words.values()
        )