Manga Text Search: Python Version
This is a rewrite of the text search script, this time in Python.
This one creates the “Searchable Text” files, so a separate process to extract the text is not needed.
Main File
search.py
import html
import json
import os
import re

from natsort import natsorted
# Folder that is walked recursively to locate the `_ocr` folders.
manga_folder = '/home/chris/Books/Comics/Japanese/'
# Text to look for; it is used as a regular-expression pattern.
search = "ありがとうご"
# Be sure to install the following:
# pip install natsort
# Note: This assumes manga volume folders are organized into series folders, such as:
# - Dragon Ball
#   - Dragon Ball Volume 1
#   - Dragon Ball Volume 2
# - Sailormoon
#   - Sailormoon Volume 1
#   - Sailormoon Volume 2
def find_ocr_folders(directory):
    """Return the paths of every ``_ocr`` folder found beneath ``directory``.

    The result is cached in ``ocr_folders.txt`` in the current working
    directory, one path per line. When that file exists it is read and
    returned as-is and ``directory`` is not scanned, so the cache has to be
    deleted by hand after folders are added, moved, or a different
    ``directory`` is used.

    Args:
        directory: Root folder to walk recursively.

    Returns:
        A list of paths, each ending in ``_ocr``.
    """
    cache_file = 'ocr_folders.txt'

    if os.path.exists(cache_file):
        # Explicit UTF-8: folder names are frequently non-ASCII and the
        # platform default encoding is not guaranteed to handle them.
        with open(cache_file, 'r', encoding='utf-8') as file:
            stripped_lines = (line.strip() for line in file)
            # Ignore blank lines so a trailing newline never becomes a '' path.
            return [path for path in stripped_lines if path]

    subfolders_with_name = [
        os.path.join(dirpath, '_ocr')
        for dirpath, dirnames, _filenames in os.walk(directory)
        if '_ocr' in dirnames
    ]
    with open(cache_file, 'w', encoding='utf-8') as file:
        file.write('\n'.join(subfolders_with_name))
    return subfolders_with_name
def list_subdirectories(directory):
    """Return the full paths of the immediate sub-folders of ``directory``.

    Plain files are skipped; the order is whatever ``os.listdir`` yields.
    """
    candidate_paths = (
        os.path.join(directory, name) for name in os.listdir(directory)
    )
    return [path for path in candidate_paths if os.path.isdir(path)]
def get_parent_folder_name(path):
    """Return the name of the folder that directly contains ``path``.

    Only the name of that folder is returned, not its full path.
    """
    containing_folder, _leaf = os.path.split(path)
    return os.path.split(containing_folder)[1]
def get_json_files(directory):
    """Return the names (not paths) of entries in ``directory`` ending in ``.json``.

    The check is on the name only, so any entry whose name ends in ``.json``
    is included. Order follows ``os.listdir``.
    """
    matching_names = []
    for entry_name in os.listdir(directory):
        if entry_name.endswith(".json"):
            matching_names.append(entry_name)
    return matching_names
def read_searchable_text_file(volume_directory, search_text_file):
    """Return the searchable text for one volume, creating it on first use.

    Args:
        volume_directory: Folder holding the volume's OCR ``.json`` files;
            only consulted when the text file has to be created.
        search_text_file: Path of the volume's searchable-text file.

    Returns:
        The contents of ``search_text_file``. When that file does not exist
        yet, it is built by ``create_searchable_text_file`` and the freshly
        built text is returned.
    """
    if not os.path.isfile(search_text_file):
        return create_searchable_text_file(volume_directory, search_text_file)

    # The text already exists, so the JSON files do not need to be listed
    # or sorted at all.
    with open(search_text_file, 'r', encoding='utf-8') as file:
        return file.read()
def create_searchable_text_file(volume_directory, search_text_file):
    """Build the searchable-text file for one volume from its OCR JSON files.

    Every ``.json`` file in ``volume_directory`` is read in natural sort
    order. For each entry of a file's ``blocks`` list that has a ``lines``
    key, one output line is produced: the JSON file name without its
    extension, a tab, and the block's ``lines`` joined with no separator.

    Args:
        volume_directory: Folder containing the volume's ``.json`` files.
        search_text_file: Path the text is written to (UTF-8). An existing
            file is always overwritten.

    Returns:
        The text that was written, with output lines separated by newlines.
    """
    output_texts = []
    for json_file_name in natsorted(get_json_files(volume_directory)):
        page_name = os.path.splitext(json_file_name)[0]
        json_file = os.path.join(volume_directory, json_file_name)
        with open(json_file, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Files without a 'blocks' key contribute nothing.
        if 'blocks' not in data:
            continue
        for block in data['blocks']:
            if 'lines' not in block:
                continue
            output_texts.append(f"{page_name}\t{''.join(block['lines'])}")

    output = '\n'.join(output_texts)
    with open(search_text_file, 'w', encoding='utf-8') as file:
        file.write(output)
    return output
def get_image_file_path(volume_directory, image_base_name):
    """Return the path of the page image that an OCR file belongs to.

    The image folder is ``volume_directory`` with its first ``_ocr`` path
    component removed. Inside it, ``image_base_name`` is tried with the
    extensions ``.jpg``, ``.jpeg`` and ``.png``, in that order.

    Args:
        volume_directory: Path of a volume folder located under ``_ocr``.
        image_base_name: File name of the page without extension.

    Returns:
        The path of the first existing candidate, or ``''`` when none exists.

    Raises:
        ValueError: If ``volume_directory`` has no ``_ocr`` component.
    """
    path_list = volume_directory.split(os.path.sep)
    path_list.remove('_ocr')
    # Re-join with the separator that was split on. This reproduces the
    # original path exactly (minus `_ocr`), so relative paths stay relative
    # and a leading drive or root component is kept untouched.
    image_directory = os.path.sep.join(path_list)
    image_without_extension = os.path.join(image_directory, image_base_name)

    common_extensions = ['.jpg', '.jpeg', '.png']
    for extension in common_extensions:
        possible_file = image_without_extension + extension
        if os.path.isfile(possible_file):
            return possible_file
    return ''
def html_for_match(match, volume_directory, image_base_name):
    """Return the ``<li>`` element that presents one search hit.

    The item shows ``image_base_name`` followed by the matched line, with the
    matched part wrapped in ``<strong>``. Focusing the item calls the page's
    ``showImage`` function with the path returned by ``get_image_file_path``
    (an empty string when no image file was found).

    Args:
        match: Match object produced by searching one line of OCR text.
        volume_directory: Path of the volume folder under ``_ocr``.
        image_base_name: Page file name without extension.

    Returns:
        The HTML for the list item as a string.
    """
    image_file_path = get_image_file_path(volume_directory, image_base_name)

    # The searched text travels with the match object, so the function does
    # not have to rely on a module-level variable being set by the caller.
    line_text = match.string
    # Escape each piece separately so the <strong> tags stay real markup
    # while characters such as '<' or '&' in the text are shown literally.
    line_with_style = (
        html.escape(line_text[:match.start()])
        + '<strong>' + html.escape(match.group()) + '</strong>'
        + html.escape(line_text[match.end():])
    )

    # json.dumps produces a valid JavaScript string literal (quotes and
    # backslashes escaped); html.escape then makes that literal safe inside
    # the single-quoted attribute, e.g. for folder names with an apostrophe.
    js_path_literal = html.escape(json.dumps(image_file_path), quote=True)

    return (
        f"<li tabindex='0' onfocus='showImage(this, {js_path_literal})'>"
        f"{html.escape(image_base_name)}: {line_with_style}</li>"
    )
# Get a list of all _ocr folders.
ocr_folders = natsorted(find_ocr_folders(manga_folder))

# Compile once; the same pattern is applied to every line of every volume.
search_pattern = re.compile(search)

output_lines = []
# The results file is written as UTF-8 below, so say so in the document.
output_lines.append('<meta charset="utf-8">')
# NOTE(review): the usage notes at the end of this file mention style.css,
# while this tag loads styles.css - confirm which name is correct.
output_lines.append('<link rel="stylesheet" href="styles.css">')
output_lines.append('<script src="script.js"></script>')
output_lines.append('<div id="parent">')
output_lines.append('<div id="matches">')

for ocr_folder in ocr_folders:
    # The folder containing `_ocr` is treated as the series.
    series_name = get_parent_folder_name(ocr_folder)
    match_found_for_series = False
    volume_directories = natsorted(list_subdirectories(ocr_folder))

    for volume_directory in volume_directories:
        volume_name = os.path.basename(volume_directory)
        match_found_for_volume = False
        search_text_file = os.path.join(volume_directory, f'{volume_name} Searchable Text.txt')
        text = read_searchable_text_file(volume_directory, search_text_file)

        for line in text.split('\n'):
            # Each line is "<page name>\t<text>". Split on the first tab only
            # and skip anything without one (an empty file, a blank line), so
            # malformed input cannot abort the whole search.
            image_base_name, separator, line_text = line.partition('\t')
            if not separator:
                continue

            match = search_pattern.search(line_text)
            if not match:
                continue

            # Headings are emitted lazily so that series and volumes without
            # any hit do not appear in the results.
            if not match_found_for_series:
                output_lines.append(f'<h2>{html.escape(series_name)}</h2>')
                match_found_for_series = True
            if not match_found_for_volume:
                output_lines.append(f'<h3>{html.escape(volume_name)}</h3>')
                output_lines.append('<ul>')
                match_found_for_volume = True
            output_lines.append(html_for_match(match, volume_directory, image_base_name))

        if match_found_for_volume:
            output_lines.append('</ul>')

output_lines.append('</div>')
output_lines.append('<div id="page">')
output_lines.append('<a id="link" target="_blank"><img id="image" /></a>')
output_lines.append('</div>')
output_lines.append('</div>')

output_file = 'results.html'
with open(output_file, 'w', encoding='utf-8') as file:
    file.write('\n'.join(output_lines))
Additional Files
This uses the script.js and style.css files from the original text search post.
Requirements
- Python
- natsort
You can install natsort from pip:
pip install natsort
On my setup, this line would be:
pip3.10 install natsort
I imagine the command differs on Windows.
Setup and Usage
Modify these two variables in the file to point to where you keep your manga series folders and what you want to search for (regular expression supported):
manga_folder = '/home/chris/Books/Comics/Japanese/'
search = "ありがとうご"
Run the file from a command-line prompt:
python search.py
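The first run may take a bit of time while it locates all the _ocr folders. The list of _ocr folders is then cached in a file called ocr_folders.txt in the directory you run the script from, so subsequent runs skip this step. Delete that file after adding or moving manga folders so the list is rebuilt.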
Once the search completes, it will generate a file called results.html.
Caveats
Filesystem Structure
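This expects that you have your manga images in per-volume folders, and that the volume folders are in per-series folders, such as:
- Dragon Ball
  - Dragon Ball Volume 1
  - Dragon Ball Volume 2
- Sailormoon
  - Sailormoon Volume 1
  - Sailormoon Volume 2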
Windows Support
I haven’t tried it out on Windows yet, but I wrote the code to be filesystem agnostic.