Skip to content

Schema

Dataset

Bases: BaseModel

Contains Dataset parameters, including input file path, indexes for state management (e.g. job batching and retries), and a unique ID

Source code in src/autolabel/schema.py
class Dataset(BaseModel):
    """Contains Dataset parameters, including input file path, indexes for state management (e.g. job batching and retries), and a unique ID"""

    id: str
    input_file: str
    start_index: int
    end_index: int

    class Config:
        orm_mode = True

    @classmethod
    def create_id(
        cls,
        dataset: Union[str, pd.DataFrame],
        config: AutolabelConfig,
        start_index: int,
        max_items: int,
    ) -> str:
        """
        Generates a unique ID for the given Dataset configuration
        Args:
            dataset: either 1) input file name or 2) pandas Dataframe
            config:  AutolabelConfig object containing project settings
            start_index: index to begin labeling job at (used for job batching, retries, state management)
            max_items: number of data points to label, beginning at start_index

        Returns:
            filehash: a unique ID generated from an MD5 hash of the function's parameters

        Raises:
            OSError: if `dataset` is a file name that cannot be opened for reading
        """
        if isinstance(dataset, str):
            # Hash the file through a context manager so the handle is closed
            # as soon as hashing finishes (the previous code leaked it).
            # NOTE(review): relies on calculate_md5 consuming the handle
            # before it returns — confirm against its implementation.
            with open(dataset, "rb") as input_handle:
                return calculate_md5(
                    [input_handle, config._dataset_config, start_index, max_items]
                )
        # DataFrame input: hash its CSV serialization instead of a file.
        return calculate_md5(
            [dataset.to_csv(), config._dataset_config, start_index, max_items]
        )

create_id(dataset, config, start_index, max_items) classmethod

Generates a unique ID for the given Dataset configuration

Parameters:

Name Type Description Default
dataset Union[str, DataFrame]

either 1) input file name or 2) pandas Dataframe

required
config AutolabelConfig

AutolabelConfig object containing project settings

required
start_index int

index to begin labeling job at (used for job batching, retries, state management)

required
max_items int

number of data points to label, beginning at start_index

required

Returns:

Name Type Description
filehash str

a unique ID generated from an MD5 hash of the function's parameters

Source code in src/autolabel/schema.py
@classmethod
def create_id(
    cls,
    dataset: Union[str, pd.DataFrame],
    config: AutolabelConfig,
    start_index: int,
    max_items: int,
) -> str:
    """
    Generates a unique ID for the given Dataset configuration
    Args:
        dataset: either 1) input file name or 2) pandas Dataframe
        config:  AutolabelConfig object containing project settings
        start_index: index to begin labeling job at (used for job batching, retries, state management)
        max_items: number of data points to label, beginning at start_index

    Returns:
        filehash: a unique ID generated from an MD5 hash of the function's parameters

    Raises:
        OSError: if `dataset` is a file name that cannot be opened for reading
    """
    if isinstance(dataset, str):
        # Hash the file through a context manager so the handle is closed
        # as soon as hashing finishes (the previous code leaked it).
        # NOTE(review): relies on calculate_md5 consuming the handle
        # before it returns — confirm against its implementation.
        with open(dataset, "rb") as input_handle:
            return calculate_md5(
                [input_handle, config._dataset_config, start_index, max_items]
            )
    # DataFrame input: hash its CSV serialization instead of a file.
    return calculate_md5(
        [dataset.to_csv(), config._dataset_config, start_index, max_items]
    )

ErrorType

Bases: str, Enum

Enum of supported error types

Source code in src/autolabel/schema.py
class ErrorType(str, Enum):
    """Enum of supported error types.

    Members also subclass ``str``, so each member compares equal to its
    lowercase string value (e.g. ``ErrorType.PARSING_ERROR == "parsing_error"``).
    """

    # NOTE(review): the conditions under which each error type is produced are
    # not visible in this file — confirm against the code that builds
    # LabelingError objects before documenting them further.
    LLM_PROVIDER_ERROR = "llm_provider_error"
    PARSING_ERROR = "parsing_error"
    OUTPUT_GUIDELINES_NOT_FOLLOWED_ERROR = "output_guidelines_not_followed_error"
    EMPTY_RESPONSE_ERROR = "empty_response_error"

FewShotAlgorithm

Bases: str, Enum

Enum of supported algorithms for choosing which examples to provide the LLM in its instruction prompt

Source code in src/autolabel/schema.py
class FewShotAlgorithm(str, Enum):
    """Enum of supported algorithms for choosing which examples to provide the LLM in its instruction prompt.

    Members also subclass ``str``, so each member compares equal to its
    lowercase string value.
    """

    # NOTE(review): the selection strategy behind each name is implemented
    # elsewhere; the names below are the only contract visible here.
    FIXED = "fixed"
    SEMANTIC_SIMILARITY = "semantic_similarity"
    MAX_MARGINAL_RELEVANCE = "max_marginal_relevance"
    LABEL_DIVERSITY_RANDOM = "label_diversity_random"
    LABEL_DIVERSITY_SIMILARITY = "label_diversity_similarity"

LLMAnnotation

Bases: BaseModel

Contains label information of a given data point, including the generated label, the prompt given to the LLM, and the LLM's response. Optionally includes a confidence_score if supported by the model

Source code in src/autolabel/schema.py
class LLMAnnotation(BaseModel):
    """Contains label information of a given data point, including the generated label, the prompt given to the LLM, and the LLM's response. Optionally includes a confidence_score if supported by the model"""

    # Required fields: whether labeling succeeded, and the generated label
    # (typed Any, so its shape is not constrained here).
    successfully_labeled: bool
    label: Any
    # NOTE(review): annotated as bytes but defaults to the str "" — confirm
    # which type consumers of curr_sample actually expect.
    curr_sample: Optional[bytes] = ""
    # Only populated if the model supports confidence scores; None otherwise.
    confidence_score: Optional[float] = None
    # presumably provider-specific metadata about the generation — TODO confirm
    generation_info: Optional[Dict[str, Any]] = None
    # The LLM's response; defaults to an empty string.
    raw_response: Optional[str] = ""
    explanation: Optional[str] = ""
    # The prompt given to the LLM; defaults to an empty string.
    prompt: Optional[str] = ""
    # Error details, or None when no error was recorded.
    error: Optional[LabelingError] = None

LabelingError

Bases: BaseModel

Contains information about an error that occurred during the labeling process

Source code in src/autolabel/schema.py
class LabelingError(BaseModel):
    """Contains information about an error that occurred during the labeling process"""

    # Category of the error, drawn from the ErrorType enum.
    error_type: ErrorType
    # Free-form message describing the error.
    error_message: str

MetricResult

Bases: BaseModel

Contains performance metrics gathered from autolabeler runs

Source code in src/autolabel/schema.py
class MetricResult(BaseModel):
    """Contains performance metrics gathered from autolabeler runs"""

    # Name of the metric.
    # NOTE(review): presumably one of the MetricType values — confirm against callers.
    name: str
    # Metric value; typed Any, so scalars and structured results are both accepted.
    value: Any
    # Defaults to True.
    # NOTE(review): presumably controls whether the metric is displayed while a
    # job is still running — confirm against the code that reads it.
    show_running: Optional[bool] = True

MetricType

Bases: str, Enum

Enum of supported performance metrics. Some metrics are always available (task agnostic), while others are only supported by certain types of tasks

Source code in src/autolabel/schema.py
class MetricType(str, Enum):
    """Enum of supported performance metrics. Some metrics are always available (task agnostic), while others are only supported by certain types of tasks.

    Members also subclass ``str``, so each member compares equal to its
    lowercase string value.
    """

    # Task agnostic
    SUPPORT = "support"
    COMPLETION_RATE = "completion_rate"
    # Classification metrics
    ACCURACY = "accuracy"
    CONFUSION_MATRIX = "confusion_matrix"
    LABEL_DISTRIBUTION = "label_distribution"
    F1 = "f1"
    F1_MICRO = "f1_micro"
    F1_MACRO = "f1_macro"
    F1_WEIGHTED = "f1_weighted"
    TEXT_PARTIAL_MATCH = "text_partial_match"
    # Confidence metrics
    AUROC = "auroc"
    THRESHOLD = "threshold"

    # Aggregate Metrics
    CLASSIFICATION_REPORT = "classification_report"

ModelProvider

Bases: str, Enum

Enum containing all LLM providers currently supported by autolabeler

Source code in src/autolabel/schema.py
class ModelProvider(str, Enum):
    """Enum containing all LLM providers currently supported by autolabeler.

    Members also subclass ``str``, so each member compares equal to its
    lowercase string value.
    """

    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    HUGGINGFACE_PIPELINE = "huggingface_pipeline"
    REFUEL = "refuel"
    GOOGLE = "google"
    COHERE = "cohere"
    # NOTE(review): presumably a user-supplied model implementation — confirm
    # against the model factory.
    CUSTOM = "custom"

RefuelLLMResult

Bases: BaseModel

List of generated outputs. This is a List[List[]] because each input could have multiple candidate generations.

Source code in src/autolabel/schema.py
class RefuelLLMResult(BaseModel):
    """Container for LLM generations together with the errors and costs
    recorded for the labeling job."""

    # Each attribute docstring below FOLLOWS the field it documents. The
    # previous layout put the strings before the fields, so documentation
    # tooling attached every description to the preceding attribute.

    generations: List[List[Generation]]
    """List of generated outputs. This is a List[List[]] because
    each input could have multiple candidate generations."""

    errors: List[Optional[LabelingError]]
    """Errors encountered while running the labeling job"""

    costs: Optional[List[float]] = []
    """Costs incurred during the labeling job"""

errors: List[Optional[LabelingError]] instance-attribute

Errors encountered while running the labeling job

generations: List[List[Generation]] instance-attribute

List of generated outputs. This is a List[List[]] because each input could have multiple candidate generations.

TaskType

Bases: str, Enum

Enum containing all the types of tasks that autolabeler currently supports

Source code in src/autolabel/schema.py
class TaskType(str, Enum):
    """Enum containing all the types of tasks that autolabeler currently supports.

    Members also subclass ``str``, so each member compares equal to its
    lowercase string value.
    """

    CLASSIFICATION = "classification"
    NAMED_ENTITY_RECOGNITION = "named_entity_recognition"
    QUESTION_ANSWERING = "question_answering"
    ENTITY_MATCHING = "entity_matching"
    MULTILABEL_CLASSIFICATION = "multilabel_classification"