Source code for simba.third_party_label_appenders.transform.labelme_to_df

import json
import os
from datetime import datetime
from typing import Optional, Tuple, Union

import pandas as pd

try:
    from typing import Literal
except:
    from typing_extensions import Literal

import numpy as np

from simba.mixins.image_mixin import ImageMixin
from simba.third_party_label_appenders.transform.utils import (
    arr_to_b64, b64_to_arr, normalize_img_dict, scale_pose_img_sizes)
from simba.utils.checks import (check_if_dir_exists,
                                check_if_keys_exist_in_dict, check_str,
                                check_valid_boolean, check_valid_tuple)
from simba.utils.printing import SimbaTimer, stdout_success
from simba.utils.read_write import (find_files_of_filetypes_in_directory,
                                    img_array_to_clahe)


[docs]class LabelMe2DataFrame:
    """
    Convert a directory of labelme .json files into a pandas dataframe.

    .. image:: _static/img/simba.third_party_label_appenders.transform.labelme_to_df.LabelMe2DataFrame.webp
       :alt: A folder of LabelMe .json annotations is converted into one pandas DataFrame with a row per image, columns for each labeled point's x/y coordinates, the image_name, and the image stored as a base64 string
       :width: 800
       :align: center

    .. note::
       The images are stores as a 64-bit bytestring under the ``image`` header of the output dataframe.

    :param Union[str, os.PathLike] labelme_dir: Directory with labelme json files.
    :param Optional[bool] greyscale: If True, converts the labelme images to greyscale if in rgb format. Default: False.
    :param Optional[bool] pad: If True, checks if all images are the same size and if not; pads the images with black border so all images are the same size.
    :param Union[Literal['min', 'max'], Tuple[int, int]] size: The size of the output images. Can be the smallesgt (min) the largest (max) or a tuple with the width and height of the images. Automatically corrects the labels to account for the image size.
    :param Optional[bool] normalize: If true, normalizes the images. Default: False.
    :param Optional[Union[str, os.PathLike]] save_path: The location where to store the dataframe. If None, then returns the dataframe. Default: None.

    :rtype: Union[None, pd.DataFrame]

    :example I:

    >>> LABELME_DIR = r'C:\troubleshooting\coco_data\labels\test_2'
    >>> runner = LabelMe2DataFrame(labelme_dir=LABELME_DIR)
    >>> runner.run()

    :example II:

    >>> LABELME_DIR = r'C:\troubleshooting\coco_data\labels\test_2'
    >>> runner = LabelMe2DataFrame(labelme_dir=LABELME_DIR, greyscale=True, pad=True, normalize=True, size='min')
    >>> runner.run()

    """

    def __init__(self,
                 labelme_dir: Union[str, os.PathLike],
                 greyscale: Optional[bool] = False,
                 clahe: bool = False,
                 pad: Optional[bool] = False,
                 size: Union[Literal['min', 'max'], Tuple[int, int]] = None,
                 normalize: bool = False,
                 save_path: Optional[Union[str, os.PathLike]] = None,
                 verbose: bool = True):

        if save_path is not None:
            if os.path.isdir(save_path):
                save_path = os.path.join(save_path, f'labelme_data_{datetime.now().strftime("%Y%m%d%H%M%S")}.csv')
        save_path = save_path
        if size is not None:
            if isinstance(size, tuple):
                check_valid_tuple(source=f'{self.__class__.__name__} size', accepted_lengths=(2,), valid_dtypes=(int,), x=size)
            elif isinstance(size, str):
                check_str(name=f'{self.__class__.__name__} size', value=size, options=('min', 'max',), raise_error=True)
        check_if_dir_exists(in_dir=labelme_dir)
        check_valid_boolean(value=greyscale, source=f'{self.__class__.__name__} greyscale', raise_error=True)
        check_valid_boolean(value=pad, source=f'{self.__class__.__name__} pad', raise_error=True)
        check_valid_boolean(value=normalize, source=f'{self.__class__.__name__} normalize', raise_error=True)
        check_valid_boolean(value=clahe, source=f'{self.__class__.__name__} clahe', raise_error=True)
        check_valid_boolean(value=verbose, source=f'{self.__class__.__name__} verbose', raise_error=True)
        self.annotation_paths = find_files_of_filetypes_in_directory(directory=labelme_dir, extensions=['.json'], raise_error=True)
        self.greyscale, self.pad, self.size = greyscale, pad, size
        self.normalize, self.verbose, self.clahe = normalize, verbose, clahe
        self.labelme_dir, self.save_path = labelme_dir, save_path


    def run(self):
        timer = SimbaTimer(start=True)
        images, annotations = {}, []
        for cnt, annot_path in enumerate(self.annotation_paths):
            if self.verbose:
                print(f'Reading annotation file {cnt + 1}/{len(self.annotation_paths)}...')
            with open(annot_path) as f:
                annot_data = json.load(f)
            check_if_keys_exist_in_dict(data=annot_data, key=['shapes', 'imageData'], name=annot_path)
            img_name = annot_data['imagePath']
            images[img_name] = b64_to_arr(annot_data['imageData'])
            if self.greyscale:
                images[img_name] = ImageMixin.img_to_greyscale(img=images[img_name])
            if self.clahe:
                images[img_name] = img_array_to_clahe(img=images[img_name])
            img_data = {}
            for bp_data in annot_data['shapes']:
                check_if_keys_exist_in_dict(data=bp_data, key=['label', 'points'], name=annot_path)
                point_x, point_y = bp_data['points'][0], bp_data['points'][1]
                lbl = bp_data['label']
                img_data[f'{lbl}_x'], img_data[f'{lbl}_y'] = point_x, point_y
            img_data['image_name'] = img_name
            annotations.append(pd.DataFrame.from_dict(img_data, orient='index').T)
        if self.pad:
            if self.verbose: print('Padding images...')
            images = ImageMixin.pad_img_stack(image_dict=images)
        if self.normalize:
            if self.verbose: print('Normalizing images...')
            images = normalize_img_dict(img_dict=images)
        img_lst = []
        for k, v in images.items():
            img_lst.append(arr_to_b64(v))
        out = pd.concat(annotations).reset_index(drop=True)
        out['image'] = img_lst
        if self.size is not None:
            if self.verbose: print('Resizing images...')
            pose_data = out.drop(['image', 'image_name'], axis=1)
            pose_data_arr = pose_data.values.reshape(-1, int(pose_data.shape[1] / 2), 2).astype(np.float32)
            new_pose, out['image'] = scale_pose_img_sizes(pose_data=pose_data_arr, imgs=list(out['image']), size=self.size)
            new_pose = new_pose.reshape(pose_data.shape[0], pose_data.shape[1])
            out.iloc[:, : new_pose.shape[1]] = new_pose
        if self.save_path is None:
            return out
        else:
            if self.verbose: print('Saving CSV file...')
            out.to_csv(self.save_path)
            timer.stop_timer()
            if self.verbose:
                stdout_success(msg=f'Labelme CSV file saved at {self.save_path}', elapsed_time=timer.elapsed_time_str, source=self.__class__.__name__)


# LABELME_DIR = r'C:\troubleshooting\coco_data\labels\test_2'
# runner = LabelMe2DataFrame(labelme_dir=LABELME_DIR)
#
#
# #runner.run()
#
# LABELME_DIR = r'C:\troubleshooting\coco_data\labels\test_2'
# runner = LabelMe2DataFrame(labelme_dir=LABELME_DIR, greyscale=True, pad=True, normalize=True, size='max', save_path=r'D:\imgs')
# runner.run()
#labelme_to_df(labelme_dir=r'C:\troubleshooting\coco_data\labels\test_2').run()
#    >>> labelme_to_df(labelme_dir=r'C:\troubleshooting\coco_data\labels\test_read', greyscale=False, pad=False, normalize=False, size='min').run()