Mokuro: Read Japanese manga with selectable text inside a browser

Manga Text Search: Python Version

This is a rewrite of the text search script, this time in Python.

This one creates the “Searchable Text” files, so a separate process to extract the text is not needed.

Main File

search.py
from natsort import natsorted
import json, os, re

# Root folder that is walked to locate the per-series "_ocr" folders.
# The folders found are cached in ocr_folders.txt (in the current working
# directory), so delete that file after changing this path.
manga_folder = '/home/chris/Books/Comics/Japanese/'
# Text to look for. It is handed to re.search(), so regular-expression
# syntax is supported.
search = "ありがとうご"

# Be sure to install the following:
#   pip install natsort

# Note: This assumes manga volume folders are organized into series folders, such as:
# - Dragon Ball
#   - Dragon Ball Volume 1
#   - Dragon Ball Volume 2
# - Sailormoon
#   - Sailormoon Volume 1
#   - Sailormoon Volume 2

def find_ocr_folders(directory):
    """Return the paths of every ``_ocr`` folder found beneath ``directory``.

    The result is cached in ``ocr_folders.txt`` in the current working
    directory, one path per line. When that file already exists it is read
    back and ``directory`` is not scanned at all, so delete the file to force
    a fresh scan.

    Args:
        directory: Root folder to walk when no cache file exists.

    Returns:
        A list of paths, each one ending in ``_ocr``.
    """
    cache_file = 'ocr_folders.txt'

    # The cache is read and written as UTF-8 explicitly (like every other
    # file this script touches) so that folder names containing Japanese
    # characters do not depend on the platform's default encoding.
    if os.path.exists(cache_file):
        with open(cache_file, 'r', encoding='utf-8') as file:
            # Skip blank lines so a trailing newline or a hand-edited cache
            # does not produce empty paths.
            return [line.strip() for line in file if line.strip()]

    subfolders_with_name = [
        os.path.join(dirpath, '_ocr')
        for dirpath, dirnames, _ in os.walk(directory)
        if '_ocr' in dirnames
    ]
    with open(cache_file, 'w', encoding='utf-8') as file:
        file.write('\n'.join(subfolders_with_name))

    return subfolders_with_name


def list_subdirectories(directory):
    """Return the full paths of the folders located directly in ``directory``.

    Plain files are ignored; the order is whatever ``os.listdir`` yields.
    """
    candidate_paths = (
        os.path.join(directory, name) for name in os.listdir(directory)
    )
    return [path for path in candidate_paths if os.path.isdir(path)]


def get_parent_folder_name(path):
    """Return the name (last component only) of the folder containing ``path``."""
    containing_folder, _ = os.path.split(path)
    return os.path.basename(containing_folder)


def get_json_files(directory):
    """Return the names (not full paths) of entries in ``directory`` ending in ``.json``."""
    json_names = []
    for entry_name in os.listdir(directory):
        if entry_name.endswith(".json"):
            json_names.append(entry_name)
    return json_names


def read_searchable_text_file(volume_directory, search_text_file):
    """Return the searchable text for one volume, building it when missing.

    Args:
        volume_directory: Folder holding the volume's per-page ``.json`` files.
            It is only consulted when the text file has to be created.
        search_text_file: Path of the volume's "Searchable Text" file.

    Returns:
        The full contents of the searchable text file as a string.
    """
    # No directory listing is done here: when the text file already exists
    # the JSON files are not needed at all, and when it does not,
    # create_searchable_text_file() lists them itself.
    if not os.path.isfile(search_text_file):
        return create_searchable_text_file(volume_directory, search_text_file)

    with open(search_text_file, 'r', encoding='utf-8') as file:
        return file.read()


def create_searchable_text_file(volume_directory, search_text_file):
    """Build a volume's searchable text file from its per-page JSON files.

    Every ``.json`` file in ``volume_directory`` is read in natural sort
    order. Each entry of a file's ``blocks`` list that has a ``lines`` key
    contributes one output line of the form ``<page name>\\t<text>``, where
    the page name is the JSON file name without its extension and the text is
    the block's ``lines`` joined together. Files without ``blocks`` and
    blocks without ``lines`` are skipped.

    The file is always (re)written, even if it already exists.

    Args:
        volume_directory: Folder holding the volume's ``.json`` files.
        search_text_file: Path of the text file to write.

    Returns:
        The text that was written to ``search_text_file``.
    """
    output_texts = []
    for json_file_name in natsorted(get_json_files(volume_directory)):
        json_file = os.path.join(volume_directory, json_file_name)
        with open(json_file, 'r', encoding='utf-8') as file:
            data = json.load(file)
        if 'blocks' not in data:
            continue

        page_name = os.path.splitext(json_file_name)[0]
        for block in data['blocks']:
            if 'lines' not in block:
                continue
            output_texts.append(f"{page_name}\t{''.join(block['lines'])}")

    output = '\n'.join(output_texts)

    with open(search_text_file, 'w', encoding='utf-8') as file:
        file.write(output)

    return output

def get_image_file_path(volume_directory, image_base_name):
    """Return the path of the original page image for an OCR page.

    The image folder is derived from ``volume_directory`` by dropping its
    first ``_ocr`` path component (``Series/_ocr/Volume 1`` becomes
    ``Series/Volume 1``). The image is then looked up there under
    ``image_base_name`` with each of the extensions ``.jpg``, ``.jpeg`` and
    ``.png``, in that order.

    Args:
        volume_directory: A volume folder located inside an ``_ocr`` folder.
        image_base_name: Image file name without its extension.

    Returns:
        The path of the first matching image file, or ``''`` when there is no
        ``_ocr`` component in ``volume_directory`` or no matching file exists.
    """
    path_parts = volume_directory.split(os.path.sep)
    if '_ocr' not in path_parts:
        return ''
    path_parts.remove('_ocr')

    # Re-join with the separator the path was split on. This leaves absolute
    # paths absolute (the leading empty component restores the root), leaves
    # relative paths relative, and keeps a Windows drive prefix such as "C:"
    # intact, which os.path.sep + os.path.join(*parts) did not.
    image_directory = os.path.sep.join(path_parts)
    image_without_extension = os.path.join(image_directory, image_base_name)

    for extension in ('.jpg', '.jpeg', '.png'):
        possible_file = image_without_extension + extension
        if os.path.isfile(possible_file):
            return possible_file
    return ''

def html_for_match(match, volume_directory, image_base_name):
    """Return the ``<li>`` element shown in the results page for one match.

    The searched line is taken from ``match.string`` and the matched part is
    wrapped in ``<strong>``. Focusing the list item calls the page's
    ``showImage`` function with the path of the original page image.

    Args:
        match: The ``re.Match`` produced by searching one line of text.
        volume_directory: The volume's folder inside ``_ocr``.
        image_base_name: Page image file name without its extension.

    Returns:
        An HTML string containing a single ``<li>`` element.
    """
    from html import escape

    image_file_path = get_image_file_path(volume_directory, image_base_name)

    # Use the text the match was actually made against rather than a
    # module-level loop variable, so the function depends only on its inputs.
    line_text = match.string
    line_with_style = (
        escape(line_text[:match.start()])
        + '<strong>' + escape(match.group()) + '</strong>'
        + escape(line_text[match.end():])
    )

    # The path sits inside a JavaScript string that itself sits inside an
    # HTML attribute: json.dumps produces a valid JS string literal (quotes
    # and backslashes escaped), then escape() makes that safe for the
    # single-quoted attribute.
    js_path = json.dumps(image_file_path, ensure_ascii=False)
    on_focus = escape(f'showImage(this, {js_path})', quote=True)

    return f"<li tabindex='0' onfocus='{on_focus}'>{escape(image_base_name)}: {line_with_style}</li>"

# Get a list of all _ocr folders.
ocr_folders = natsorted(find_ocr_folders(manga_folder))

# Compile the search expression once instead of on every line searched.
search_pattern = re.compile(search)

# Page header: stylesheet, script, and the opening of the two-pane layout
# (match list on one side, page image on the other).
output_lines = [
    '<link rel="stylesheet" href="styles.css">',
    '<script src="script.js"></script>',
    '<div id="parent">',
    '<div id="matches">',
]

for ocr_folder in ocr_folders:
    # The folder list may come from a cache file, so an entry can refer to a
    # folder that has since been moved or deleted; skip it rather than crash.
    if not os.path.isdir(ocr_folder):
        continue

    series_name = get_parent_folder_name(ocr_folder)
    match_found_for_series = False
    volume_directories = natsorted(list_subdirectories(ocr_folder))
    for volume_directory in volume_directories:

        volume_name = os.path.basename(volume_directory)
        match_found_for_volume = False

        search_text_file = os.path.join(volume_directory, f'{volume_name} Searchable Text.txt')
        text = read_searchable_text_file(volume_directory, search_text_file)
        for line in text.split('\n'):
            # Each line is "<page name>\t<text>". Lines without a tab (for
            # example the single empty line of a volume with no text) are
            # skipped instead of failing to unpack.
            image_base_name, separator, line_text = line.partition('\t')
            if not separator:
                continue

            match = search_pattern.search(line_text)
            if not match:
                continue

            # Series and volume headings are emitted lazily, only once the
            # first match inside them is found.
            if not match_found_for_series:
                output_lines.append(f'<h2>{series_name}</h2>')
                match_found_for_series = True
            if not match_found_for_volume:
                output_lines.append(f'<h3>{volume_name}</h3>')
                output_lines.append('<ul>')
                match_found_for_volume = True
            output_lines.append(html_for_match(match, volume_directory, image_base_name))
        if match_found_for_volume:
            output_lines.append('</ul>')

output_lines.append('</div>')

# Pane that displays the page image for the focused match.
output_lines.append('<div id="page">')
output_lines.append('<a id="link" target="_blank"><img id="image" /></a>')
output_lines.append('</div>')

output_lines.append('</div>')

output_file = 'results.html'
with open(output_file, 'w', encoding='utf-8') as file:
    file.write('\n'.join(output_lines))

Additional Files

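This uses the script.js and style.css files from the original text search post. Note that the generated results.html loads them by relative path as script.js and styles.css, so both files need to sit next to results.html, with the stylesheet saved under the name styles.css.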

Requirements

You can install natsort from pip:

pip install natsort

On my setup, this line would be:

pip3.10 install natsort

I imagine the command varies slightly on Windows.

Setup and Usage

Modify these two variables in the file to point to where you keep your manga series folders and to set what you want to search for (regular expressions are supported):

manga_folder = '/home/chris/Books/Comics/Japanese/'
search = "ありがとうご"

Run the file from a command-line prompt:

python search.py

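The first run may take a bit of time while it locates all the _ocr folders. The resulting list is cached in a file called ocr_folders.txt in the folder you run the script from, so later runs skip that scan; delete the file after adding or moving manga so the new folders are found. (If subsequent runs still take too long, let me know.)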

Once the search completes, it will generate a file called results.html.

Caveats

Filesystem Structure

This expects that you have your manga images in per-volume folders, and that the volume folders are in per-series folders, such as:

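- Dragon Ball
  - Dragon Ball Volume 1
  - Dragon Ball Volume 2
- Sailormoon
  - Sailormoon Volume 1
  - Sailormoon Volume 2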

Windows Support

I haven’t tried it out on Windows yet, but I wrote the code to be filesystem agnostic.

3 Likes