# Source code for cheesechaser.datapool.nhentai

"""
This module provides data pool classes for managing and accessing NHentai manga and image data.

The module includes two main classes:

1. NHentaiImagesDataPool: A data pool for managing NHentai images.
2. NHentaiMangaDataPool: A data pool for managing NHentai manga data, including image associations.

These classes provide functionality for retrieving manga information, downloading images,
and managing resources from a Hugging Face dataset repository.
"""

import json
import logging
import os.path
import shutil
from contextlib import contextmanager
from functools import lru_cache
from threading import Lock
from typing import ContextManager, Tuple, Any

import pandas as pd
from hbutils.system import TemporaryDirectory
from huggingface_hub.utils import LocalEntryNotFoundError

from .base import IncrementIDDataPool, DataPool, ResourceNotFoundError
from ..utils import get_hf_client

# Repository ID shared by both pools in this module: it is passed as the data and
# index repository of the images pool, and as ``repo_id`` (with ``repo_type='dataset'``)
# when downloading ``posts.csv``.
_DATA_REPO = 'deepghs/nhentai_full'


class NHentaiImagesDataPool(IncrementIDDataPool):
    """
    Data pool giving access to individual NHentai images.

    A thin specialization of :class:`IncrementIDDataPool` that points both the data
    repository and the index repository at the NHentai dataset repository, using the
    same revision for each.

    :param revision: Repository revision used for both data and index, defaults to 'main'.
    :type revision: str
    """

    def __init__(self, revision: str = 'main'):
        """
        Configure the pool for the given repository revision.

        :param revision: Repository revision used for both data and index, defaults to 'main'.
        :type revision: str
        """
        # Data and index live in the same repository at the same revision.
        repo_id, rev = _DATA_REPO, revision
        super().__init__(
            data_repo_id=repo_id,
            data_revision=rev,
            idx_repo_id=repo_id,
            idx_revision=rev,
            base_level=4,
        )


class NHentaiMangaDataPool(DataPool):
    """
    A data pool class for managing NHentai manga data.

    Each resource ID is a manga ID. A manga is resolved to its list of image IDs through
    the ``posts.csv`` table of the dataset repository, and the images themselves are
    fetched through an internal :class:`NHentaiImagesDataPool`.

    :param revision: The revision of the data to use, defaults to 'main'.
    :type revision: str
    """

    # Held while the cached manga-to-images map is looked up in ``mock_resource``.
    __data_lock__ = Lock()

    def __init__(self, revision: str = 'main'):
        """
        Initialize the NHentaiMangaDataPool.

        :param revision: The revision of the data to use, defaults to 'main'.
        :type revision: str
        """
        self.revision = revision
        self.images_pool = NHentaiImagesDataPool(revision=revision)

    @classmethod
    @lru_cache()
    def manga_id_map(cls, revision: str = 'main', local_files_prefer: bool = True):
        """
        Get a mapping of manga IDs to their associated image IDs.

        The result is cached per argument combination; the returned dictionary is the
        cached object itself, so callers should treat it as read-only.

        :param revision: The revision of the data to use, defaults to 'main'.
        :type revision: str
        :param local_files_prefer: Whether to prefer local files, defaults to True.
        :type local_files_prefer: bool
        :return: A dictionary mapping manga IDs to lists of image IDs.
        :rtype: dict
        """
        df = cls.manga_posts_table(revision, local_files_prefer)
        # Only two columns are needed, so read them directly instead of materializing
        # a full per-row dict of every column in the table.
        return {
            manga_id: json.loads(raw_image_ids)
            for manga_id, raw_image_ids in zip(df['id'].tolist(), df['image_ids'].tolist())
        }

    @classmethod
    @lru_cache()
    def manga_posts_table(cls, revision: str = 'main', local_files_prefer: bool = True):
        """
        Retrieve the manga posts table as a pandas DataFrame.

        ``posts.csv`` is first requested with ``local_files_only`` set according to
        ``local_files_prefer``. If that attempt raises ``LocalEntryNotFoundError``,
        the download is retried once with ``local_files_only=False``.

        The result is cached per argument combination; the returned DataFrame is the
        cached object itself, so callers should not modify it in place.

        :param revision: The revision of the data to use, defaults to 'main'.
        :type revision: str
        :param local_files_prefer: Whether to prefer local files, defaults to True.
        :type local_files_prefer: bool
        :return: A pandas DataFrame containing manga post information.
        :rtype: pandas.DataFrame
        """
        client = get_hf_client()
        # Arguments shared by the first attempt and the fallback attempt.
        request = dict(
            repo_id=_DATA_REPO,
            repo_type='dataset',
            revision=revision,
            filename='posts.csv',
        )
        try:
            csv_file = client.hf_hub_download(
                local_files_only=bool(local_files_prefer),
                **request,
            )
        except LocalEntryNotFoundError:
            csv_file = client.hf_hub_download(local_files_only=False, **request)

        return pd.read_csv(csv_file)

    @contextmanager
    def mock_resource(self, resource_id, resource_info) -> ContextManager[Tuple[str, Any]]:
        """
        Create a mock resource for a given manga.

        The images of the manga are downloaded into a scratch sub-directory of a
        temporary directory, then moved into a second sub-directory under the name
        ``{resource_id}_p{page}{ext}``, where ``page`` is the 1-based position of the
        image in the manga's image list. Images that were not downloaded are skipped
        and their image IDs are reported in an info-level log message.

        The yielded directory lives inside the temporary directory, so it should only
        be used while the context is active.

        :param resource_id: The ID of the manga resource.
        :type resource_id: int
        :param resource_info: Additional information about the resource.
        :type resource_info: Any
        :yield: A tuple containing the path to the directory with the images and the resource info.
        :rtype: Tuple[str, Any]
        :raises ResourceNotFoundError: If the specified manga resource is not found.
        """
        with self.__data_lock__:
            maps = self.manga_id_map(self.revision, local_files_prefer=True)
        if resource_id not in maps:
            raise ResourceNotFoundError(f'Manga {resource_id!r} not found.')
        image_ids = maps[resource_id]

        with TemporaryDirectory() as td:
            origin_dir = os.path.join(td, 'origin')
            os.makedirs(origin_dir, exist_ok=True)
            logging.info('Images %r found for manga resource %s.', image_ids, resource_id)
            self.images_pool.batch_download_to_directory(
                image_ids, origin_dir,
                save_metainfo=False,
            )

            # Index the downloaded files by the integer image ID encoded in the file name.
            files = {}
            for filename in os.listdir(origin_dir):
                body, _ = os.path.splitext(filename)
                files[int(body)] = filename

            dst_dir = os.path.join(td, 'dst')
            os.makedirs(dst_dir, exist_ok=True)
            missing_ids = []
            for page, image_id in enumerate(image_ids, start=1):
                filename = files.get(image_id)
                if filename is None:
                    # Record the image ID itself (not the page position), so the log
                    # message below really lists the images that are missing.
                    missing_ids.append(image_id)
                    continue

                _, ext = os.path.splitext(filename)
                shutil.move(
                    os.path.join(origin_dir, filename),
                    os.path.join(dst_dir, f'{resource_id}_p{page}{ext}'),
                )

            if missing_ids:
                logging.info('Image %r not found for resource %r.', missing_ids, resource_id)
            yield dst_dir, resource_info