# Source code for simba.labelling.extract_labelling_meta

import os
from typing import Optional, Union

import pandas as pd

from simba.mixins.config_reader import ConfigReader
from simba.utils.checks import (
    check_all_file_names_are_represented_in_video_log,
    check_if_df_field_is_boolean, check_that_column_exist, check_valid_boolean)
from simba.utils.data import detect_bouts
from simba.utils.errors import CountError
from simba.utils.printing import SimbaTimer, stdout_success
from simba.utils.read_write import get_fn_ext, read_df


class AnnotationMetaDataExtractor(ConfigReader):

    """
    Extract annotation statistics (number of annotated frames, seconds, bouts etc.) for all classifiers in a SimBA project
    to MS Excel format.

    Typical use is ``run()`` followed by ``save()``. ``run()`` populates ``self.results`` (nested dict keyed by
    video name, then classifier name) and ``self.bout_data`` (one bout dataframe per data file). ``save()`` writes the
    workbook to ``self.save_path`` and may be called more than once after a single ``run()``.

    .. note::
       `Example expected output <https://github.com/sgoldenlab/simba/blob/master/misc/ANNOTATION_STATISTICS_20240713132805.xlsx>`__.

    :param Union[str, os.PathLike] config_path: path to SimBA configparser.ConfigParser project_config.ini
    :param Optional[bool] split_by_video: If True, includes a worksheet where the annotation counts are split by video. Default True.
    :param Optional[bool] annotated_bouts: If True, includes information on annotated bouts (start and stop time and bout length). Default True.
    :raises CountError: If the project has no classifier names, or if no data files are found in the targets directory.

    :example:

    >>> annotation_meta_extractor = AnnotationMetaDataExtractor(config_path='/Users/simon/Desktop/envs/simba/troubleshooting/two_black_animals_14bp/project_folder/project_config.ini')
    >>> annotation_meta_extractor.run()
    >>> annotation_meta_extractor.save()

    """

    # Output labels applied positionally to the frame returned by detect_bouts.
    # NOTE(review): assumes detect_bouts returns exactly six columns in this order - confirm against simba.utils.data.
    _BOUT_COLUMNS = ['ANNOTATED CLASSIFIER', 'ANNOTATED START TIME', 'ANNOTATED END TIME', 'ANNOTATED START FRAME', 'ANNOTATED END FRAME', 'ANNOTATED BOUT TIME (S)']
    # Long-format layout of the per-video statistics table built in __aggregates().
    _BY_VIDEO_COLUMNS = ['VIDEO', 'CLASSIFIER', 'ANNOTATION MEASUREMENT', 'ANNOTATION STATISTIC']

    def __init__(self,
                 config_path: Union[str, os.PathLike],
                 split_by_video: Optional[bool] = True,
                 annotated_bouts: Optional[bool] = True):

        # clf_names, target_file_paths, targets_folder, logs_path, datetime (and the other self.* project
        # attributes used below) are not defined in this class; presumably they are set by ConfigReader.
        ConfigReader.__init__(self, config_path=config_path, read_video_info=True)
        if len(self.clf_names) == 0:
            raise CountError(msg=f'No classifier names associated with SimBA project {config_path}', source=self.__class__.__name__)
        if len(self.target_file_paths) == 0:
            raise CountError(msg=f'No data files found inside the {self.targets_folder} directory', source=self.__class__.__name__)
        self.save_path = os.path.join(self.logs_path, f'ANNOTATION_STATISTICS_{self.datetime}.xlsx')
        check_valid_boolean(value=[annotated_bouts, split_by_video])
        self.annotated_bouts, self.split_by_video = annotated_bouts, split_by_video

    def run(self):
        """
        Compute per-video, per-classifier annotation counts and annotated bouts for every data file.

        Resets and fills ``self.results`` and ``self.bout_data``; nothing is written to disk until ``save()`` is called.
        """
        check_all_file_names_are_represented_in_video_log(video_info_df=self.video_info_df, data_paths=self.target_file_paths)
        self.results, self.bout_data = {}, []
        print(f'Analyzing annotations in {len(self.target_file_paths)} data file(s)...')
        for file_path in self.target_file_paths:
            _, video_name, _ = get_fn_ext(filepath=file_path)
            self.results[video_name] = {}
            _, _, fps = self.read_video_info(video_name=video_name)
            print(f'Analyzing annotations in {video_name}... ')
            df = read_df(file_path=file_path, file_type=self.file_type)
            check_that_column_exist(df=df, column_name=self.clf_names, file_name=file_path)
            bouts = detect_bouts(data_df=df, target_lst=self.clf_names, fps=fps)
            bouts.columns = list(self._BOUT_COLUMNS)
            # Put the video name first so the concatenated bout sheet can be filtered by video.
            bouts.insert(0, 'VIDEO', video_name)
            self.bout_data.append(bouts)
            for clf in self.clf_names:
                check_if_df_field_is_boolean(df=df, field=clf, df_name=file_path)
                # Count matching rows from the boolean masks directly rather than copying
                # two full sub-frames only to take their length.
                present_cnt = int((df[clf] == 1).sum())
                absent_cnt = int((df[clf] == 0).sum())
                self.results[video_name][clf] = {'ANNOTATED PRESENT FRAME COUNT': present_cnt,
                                                 'ANNOTATED PRESENT TIME (S)': round((present_cnt / fps), 4),
                                                 'ANNOTATED ABSENT FRAMES COUNT': absent_cnt,
                                                 'ANNOTATED ABSENT TIME (S)': round((absent_cnt / fps), 4)}

    def __aggregates(self):
        """
        Build ``self.by_video`` (statistics indexed by video and classifier) and ``self.aggregates``
        (statistics summed over all videos per classifier and measurement) from ``self.results``.
        """
        # Collect all rows first and build the frame once; growing a DataFrame one
        # .loc row at a time re-allocates on every insert.
        rows = []
        for video_name, video_results in self.results.items():
            for clf in self.clf_names:
                rows.extend([video_name, clf, measurement, statistic] for measurement, statistic in video_results[clf].items())
        # dtype=object keeps frame counts as ints next to float durations in the shared statistic column.
        self.by_video = pd.DataFrame(rows, columns=self._BY_VIDEO_COLUMNS, dtype=object)
        self.aggregates = self.by_video.groupby(by=['CLASSIFIER', 'ANNOTATION MEASUREMENT'])['ANNOTATION STATISTIC'].sum().to_frame()
        self.by_video = self.by_video.set_index(['VIDEO', 'CLASSIFIER'])

    def save(self):
        """
        Write the statistics computed by ``run()`` to the Excel workbook at ``self.save_path``.

        The ``TOTAL ANNOTATION COUNTS`` sheet is always written. ``VIDEO ANNOTATION COUNTS`` is written when
        ``split_by_video`` is True and ``VIDEO ANNOTATION BOUT DATA`` when ``annotated_bouts`` is True.
        After this call ``self.bout_data`` is a single concatenated dataframe.
        """
        self.__aggregates()
        # run() leaves bout_data as a list of per-video frames. Concatenate only in that case, so that a
        # repeated save() does not pass an already-concatenated DataFrame to pd.concat (which raises TypeError).
        if not isinstance(self.bout_data, pd.DataFrame):
            self.bout_data = pd.concat(self.bout_data, axis=0)
        with pd.ExcelWriter(self.save_path) as writer:
            self.aggregates.to_excel(writer, sheet_name='TOTAL ANNOTATION COUNTS', index=True)
            if self.split_by_video:
                self.by_video.to_excel(writer, sheet_name='VIDEO ANNOTATION COUNTS', index=True)
            if self.annotated_bouts:
                self.bout_data.to_excel(writer, sheet_name='VIDEO ANNOTATION BOUT DATA', index=False)
        self.timer.stop_timer()
        stdout_success(msg=f'Annotation data for {len(self.target_file_paths)} video(s) saved at {self.save_path}', source=self.__class__.__name__, elapsed_time=self.timer.elapsed_time)


# annotation_meta_extractor = AnnotationMetaDataExtractor(config_path='/Users/simon/Desktop/envs/simba/troubleshooting/two_black_animals_14bp/project_folder/project_config.ini')
# annotation_meta_extractor.run()
# annotation_meta_extractor.save()