Source code for cheesechaser.datapool.nhentai

"""
This module provides data pool classes for managing and accessing NHentai manga and image data.

The module includes two main classes:

1. NHentaiImagesDataPool: A data pool for managing NHentai images.
2. NHentaiMangaDataPool: A data pool for managing NHentai manga data, including image associations.

These classes provide functionality for retrieving manga information, downloading images,
and managing resources from a Hugging Face dataset repository.
"""

import json
import logging
import os.path
import shutil
from contextlib import contextmanager
from functools import lru_cache
from threading import Lock
from typing import ContextManager, Tuple, Any

import pandas as pd
from hbutils.system import TemporaryDirectory
from huggingface_hub.utils import LocalEntryNotFoundError

from .base import IncrementIDDataPool, DataPool, ResourceNotFoundError
from ..utils import get_hf_client

# Hugging Face dataset repository used by both pools in this module: it is passed
# as the data/index repository of the images pool and as the source of ``posts.csv``.
_DATA_REPO = 'deepghs/nhentai_full'


class NHentaiImagesDataPool(IncrementIDDataPool):
    """
    Data pool for the individual images of the NHentai dataset.

    This is a thin specialization of :class:`IncrementIDDataPool` in which both
    the data and the index are read from the same repository (``_DATA_REPO``)
    at the same revision.

    :param revision: The revision of the data to use, defaults to 'main'.
    :type revision: str
    """

    def __init__(self, revision: str = 'main'):
        """
        Set up the pool against the given repository revision.

        :param revision: The revision of the data to use, defaults to 'main'.
        :type revision: str
        """
        # Data and index share one repository and one revision.
        pool_settings = {
            'data_repo_id': _DATA_REPO,
            'data_revision': revision,
            'idx_repo_id': _DATA_REPO,
            'idx_revision': revision,
            'base_level': 4,
        }
        IncrementIDDataPool.__init__(self, **pool_settings)
class NHentaiMangaDataPool(DataPool):
    """
    A data pool class for managing NHentai manga data.

    A manga is resolved to its list of image IDs through ``posts.csv`` of the
    dataset repository; the images themselves are fetched through an internal
    :class:`NHentaiImagesDataPool`.

    :param revision: The revision of the data to use, defaults to 'main'.
    :type revision: str
    """

    # Class-level lock shared by every instance; it is held while the (cached)
    # manga-ID map is being loaded in :meth:`mock_resource`.
    __data_lock__ = Lock()

    def __init__(self, revision: str = 'main'):
        """
        Initialize the NHentaiMangaDataPool.

        :param revision: The revision of the data to use, defaults to 'main'.
        :type revision: str
        """
        self.revision = revision
        self.images_pool = NHentaiImagesDataPool(revision=revision)

    @classmethod
    @lru_cache()
    def manga_id_map(cls, revision: str = 'main', local_files_prefer: bool = True):
        """
        Get a mapping of manga IDs to their associated image IDs.

        The mapping is built from the ``id`` and ``image_ids`` columns of the
        posts table; ``image_ids`` is stored as a JSON string and decoded here.
        Results are memoized with :func:`functools.lru_cache`.

        :param revision: The revision of the data to use, defaults to 'main'.
        :type revision: str
        :param local_files_prefer: Whether to prefer local files, defaults to True.
        :type local_files_prefer: bool
        :return: A dictionary mapping manga IDs to lists of image IDs.
        :rtype: dict
        """
        df = cls.manga_posts_table(revision, local_files_prefer)
        return {
            item['id']: json.loads(item['image_ids'])
            for item in df.to_dict('records')
        }

    @classmethod
    @lru_cache()
    def manga_posts_table(cls, revision: str = 'main', local_files_prefer: bool = True):
        """
        Retrieve the manga posts table (``posts.csv``) as a pandas DataFrame.

        When ``local_files_prefer`` is true the file is first looked up in the
        local Hugging Face cache only; if that lookup raises
        ``LocalEntryNotFoundError`` the download is retried with network access
        allowed. Results are memoized with :func:`functools.lru_cache`.

        :param revision: The revision of the data to use, defaults to 'main'.
        :type revision: str
        :param local_files_prefer: Whether to prefer local files, defaults to True.
        :type local_files_prefer: bool
        :return: A pandas DataFrame containing manga post information.
        :rtype: pandas.DataFrame
        """
        client = get_hf_client()
        # Arguments shared by the cache-only attempt and the fallback download.
        download_kwargs = dict(
            repo_id=_DATA_REPO,
            repo_type='dataset',
            revision=revision,
            filename='posts.csv',
        )
        try:
            csv_file = client.hf_hub_download(
                local_files_only=bool(local_files_prefer),
                **download_kwargs,
            )
        except LocalEntryNotFoundError:
            csv_file = client.hf_hub_download(
                local_files_only=False,
                **download_kwargs,
            )

        return pd.read_csv(csv_file)

    @contextmanager
    def mock_resource(self, resource_id, resource_info) -> ContextManager[Tuple[str, Any]]:
        """
        Create a mock resource for a given manga.

        The images of the manga are downloaded through the images pool into a
        scratch directory and then moved into a second directory, renamed to
        ``<resource_id>_p<N><ext>`` where ``N`` is the 1-based position of the
        image in the manga. Both directories live inside a temporary directory
        that is only entered for the duration of the context.

        :param resource_id: The ID of the manga resource.
        :type resource_id: int
        :param resource_info: Additional information about the resource.
        :type resource_info: Any
        :yield: A tuple containing the path to the directory with the renamed
            images and the resource info.
        :rtype: Tuple[str, Any]
        :raises ResourceNotFoundError: If the specified manga resource is not found.
        """
        # Only the loading of the ID map is done under the lock, so downloads of
        # different resources are not serialized.
        # NOTE(review): presumably this avoids concurrent first-time loads of
        # the cached table -- confirm.
        with self.__data_lock__:
            maps = self.manga_id_map(self.revision, local_files_prefer=True)
        if resource_id not in maps:
            raise ResourceNotFoundError(f'Manga {resource_id!r} not found.')

        with TemporaryDirectory() as td:
            origin_dir = os.path.join(td, 'origin')
            os.makedirs(origin_dir, exist_ok=True)

            image_ids = maps[resource_id]
            logging.info(f'Images {image_ids!r} found for manga resource {resource_id}.')
            self.images_pool.batch_download_to_directory(
                image_ids, origin_dir,
                save_metainfo=False,
            )

            # Index the downloaded files by the integer image ID in their file name.
            files = {}
            for src_image_file in os.listdir(origin_dir):
                body, _ = os.path.splitext(src_image_file)
                files[int(body)] = src_image_file

            dst_dir = os.path.join(td, 'dst')
            os.makedirs(dst_dir, exist_ok=True)
            missing_ids = []
            for i, image_id in enumerate(image_ids, start=1):
                if image_id in files:
                    src_file = os.path.join(origin_dir, files[image_id])
                    _, ext = os.path.splitext(src_file)
                    dst_file = os.path.join(dst_dir, f'{resource_id}_p{i}{ext}')
                    shutil.move(src_file, dst_file)
                else:
                    # Record the image ID itself (not the page counter) so the
                    # log line below really lists the images that are missing.
                    missing_ids.append(image_id)
            if missing_ids:
                logging.info(f'Image {missing_ids!r} not found for resource {resource_id!r}.')

            yield dst_dir, resource_info