Source code for simba.third_party_label_appenders.transform.labelme_to_df

import json
import os
from datetime import datetime
from typing import Optional, Tuple, Union

import pandas as pd

try:
    from typing import Literal
except:
    from typing_extensions import Literal

import numpy as np

from simba.mixins.image_mixin import ImageMixin
from simba.third_party_label_appenders.transform.utils import (
    arr_to_b64, b64_to_arr, normalize_img_dict, scale_pose_img_sizes)
from simba.utils.checks import (check_if_dir_exists,
                                check_if_keys_exist_in_dict, check_str,
                                check_valid_boolean, check_valid_tuple)
from simba.utils.printing import SimbaTimer, stdout_success
from simba.utils.read_write import (find_files_of_filetypes_in_directory,
                                    img_array_to_clahe)


[docs]class LabelMe2DataFrame: """ Convert a directory of labelme .json files into a pandas dataframe. .. note:: The images are stores as a 64-bit bytestring under the ``image`` header of the output dataframe. :param Union[str, os.PathLike] labelme_dir: Directory with labelme json files. :param Optional[bool] greyscale: If True, converts the labelme images to greyscale if in rgb format. Default: False. :param Optional[bool] pad: If True, checks if all images are the same size and if not; pads the images with black border so all images are the same size. :param Union[Literal['min', 'max'], Tuple[int, int]] size: The size of the output images. Can be the smallesgt (min) the largest (max) or a tuple with the width and height of the images. Automatically corrects the labels to account for the image size. :param Optional[bool] normalize: If true, normalizes the images. Default: False. :param Optional[Union[str, os.PathLike]] save_path: The location where to store the dataframe. If None, then returns the dataframe. Default: None. :rtype: Union[None, pd.DataFrame] :example I: >>> LABELME_DIR = r'C:\troubleshooting\coco_data\labels\test_2' >>> runner = LabelMe2DataFrame(labelme_dir=LABELME_DIR) >>> runner.run() :example II: >>> LABELME_DIR = r'C:\troubleshooting\coco_data\labels\test_2' >>> runner = LabelMe2DataFrame(labelme_dir=LABELME_DIR, greyscale=True, pad=True, normalize=True, size='min') >>> runner.run() """ def __init__(self, labelme_dir: Union[str, os.PathLike], greyscale: Optional[bool] = False, clahe: bool = False, pad: Optional[bool] = False, size: Union[Literal['min', 'max'], Tuple[int, int]] = None, normalize: bool = False, save_path: Optional[Union[str, os.PathLike]] = None, verbose: bool = True): if save_path is not None: if os.path.isdir(save_path): save_path = os.path.join(save_path, f'labelme_data_{datetime.now().strftime("%Y%m%d%H%M%S")}.csv') save_path = save_path if size is not None: if isinstance(size, tuple): check_valid_tuple(source=f'{self.__class__.__name__} size', accepted_lengths=(2,), valid_dtypes=(int,), x=size) elif isinstance(size, str): check_str(name=f'{self.__class__.__name__} size', value=size, options=('min', 'max',), raise_error=True) check_if_dir_exists(in_dir=labelme_dir) check_valid_boolean(value=greyscale, source=f'{self.__class__.__name__} greyscale', raise_error=True) check_valid_boolean(value=pad, source=f'{self.__class__.__name__} pad', raise_error=True) check_valid_boolean(value=normalize, source=f'{self.__class__.__name__} normalize', raise_error=True) check_valid_boolean(value=clahe, source=f'{self.__class__.__name__} clahe', raise_error=True) check_valid_boolean(value=verbose, source=f'{self.__class__.__name__} verbose', raise_error=True) self.annotation_paths = find_files_of_filetypes_in_directory(directory=labelme_dir, extensions=['.json'], raise_error=True) self.greyscale, self.pad, self.size = greyscale, pad, size self.normalize, self.verbose, self.clahe = normalize, verbose, clahe self.labelme_dir, self.save_path = labelme_dir, save_path def run(self): timer = SimbaTimer(start=True) images, annotations = {}, [] for cnt, annot_path in enumerate(self.annotation_paths): if self.verbose: print(f'Reading annotation file {cnt + 1}/{len(self.annotation_paths)}...') with open(annot_path) as f: annot_data = json.load(f) check_if_keys_exist_in_dict(data=annot_data, key=['shapes', 'imageData'], name=annot_path) img_name = annot_data['imagePath'] images[img_name] = b64_to_arr(annot_data['imageData']) if self.greyscale: images[img_name] = ImageMixin.img_to_greyscale(img=images[img_name]) if self.clahe: images[img_name] = img_array_to_clahe(img=images[img_name]) img_data = {} for bp_data in annot_data['shapes']: check_if_keys_exist_in_dict(data=bp_data, key=['label', 'points'], name=annot_path) point_x, point_y = bp_data['points'][0], bp_data['points'][1] lbl = bp_data['label'] img_data[f'{lbl}_x'], img_data[f'{lbl}_y'] = point_x, point_y img_data['image_name'] = img_name annotations.append(pd.DataFrame.from_dict(img_data, orient='index').T) if self.pad: if self.verbose: print('Padding images...') images = ImageMixin.pad_img_stack(image_dict=images) if self.normalize: if self.verbose: print('Normalizing images...') images = normalize_img_dict(img_dict=images) img_lst = [] for k, v in images.items(): img_lst.append(arr_to_b64(v)) out = pd.concat(annotations).reset_index(drop=True) out['image'] = img_lst if self.size is not None: if self.verbose: print('Resizing images...') pose_data = out.drop(['image', 'image_name'], axis=1) pose_data_arr = pose_data.values.reshape(-1, int(pose_data.shape[1] / 2), 2).astype(np.float32) new_pose, out['image'] = scale_pose_img_sizes(pose_data=pose_data_arr, imgs=list(out['image']), size=self.size) new_pose = new_pose.reshape(pose_data.shape[0], pose_data.shape[1]) out.iloc[:, : new_pose.shape[1]] = new_pose if self.save_path is None: return out else: if self.verbose: print('Saving CSV file...') out.to_csv(self.save_path) timer.stop_timer() if self.verbose: stdout_success(msg=f'Labelme CSV file saved at {self.save_path}', elapsed_time=timer.elapsed_time_str, source=self.__class__.__name__)
# LABELME_DIR = r'C:\troubleshooting\coco_data\labels\test_2' # runner = LabelMe2DataFrame(labelme_dir=LABELME_DIR) # # # #runner.run() # # LABELME_DIR = r'C:\troubleshooting\coco_data\labels\test_2' # runner = LabelMe2DataFrame(labelme_dir=LABELME_DIR, greyscale=True, pad=True, normalize=True, size='max', save_path=r'D:\imgs') # runner.run() #labelme_to_df(labelme_dir=r'C:\troubleshooting\coco_data\labels\test_2').run() # >>> labelme_to_df(labelme_dir=r'C:\troubleshooting\coco_data\labels\test_read', greyscale=False, pad=False, normalize=False, size='min').run()