unitorch.models.grounding_dino¤

GroundingDinoProcessor¤

Bases: HfTextClassificationProcessor, HfImageClassificationProcessor

Initializes the GroundingDinoProcessor.

Parameters:

Name	Type	Description	Default
`vocab_path`	`str`	Path to the BERT vocabulary file.	required
`vision_config_path`	`str`	Path to the GroundingDINO image processor configuration file.	required
`max_seq_length`	`int`	Maximum sequence length. Defaults to 128.	`128`
`position_start_id`	`int`	Starting position ID. Defaults to 0.	`0`

Source code in src/unitorch/models/grounding_dino/processing.py

def __init__(
    self,
    vocab_path: str,
    vision_config_path: str,
    max_seq_length: Optional[int] = 128,
    position_start_id: Optional[int] = 0,
):
    """
    Builds the tokenizer and image processor and initialises both parent processors.

    Args:
        vocab_path (str): Vocabulary file passed to ``BertTokenizer``.
        vision_config_path (str): JSON file passed to
            ``GroundingDinoImageProcessor.from_json_file``.
        max_seq_length (int, optional): Forwarded to the text processor. Defaults to 128.
        position_start_id (int, optional): Forwarded to the text processor. Defaults to 0.
    """
    # Lower-casing BERT tokenizer with basic tokenization enabled.
    self.bert_tokenizer = BertTokenizer(
        vocab_path,
        do_lower_case=True,
        do_basic_tokenize=True,
    )
    self.vision_processor = GroundingDinoImageProcessor.from_json_file(
        vision_config_path
    )

    # Each parent is initialised explicitly; they take different arguments.
    text_settings = dict(
        tokenizer=self.bert_tokenizer,
        max_seq_length=max_seq_length,
        source_type_id=0,
        target_type_id=1,
        position_start_id=position_start_id,
    )
    HfTextClassificationProcessor.__init__(self, **text_settings)
    HfImageClassificationProcessor.__init__(
        self, vision_processor=self.vision_processor
    )

bert_tokenizer `instance-attribute` ¤

bert_tokenizer = BertTokenizer(
    vocab_path, do_basic_tokenize=True, do_lower_case=True
)

vision_processor `instance-attribute` ¤

vision_processor = from_json_file(vision_config_path)

detection ¤

detection(
    text: str,
    image: Union[str, Image],
    bboxes: List[List[float]],
    classes: List[str],
)

Processes image and text for training with ground-truth detections.

Parameters:

Name	Type	Description	Default
`text`	`str`	Input text describing objects.	required
`image`	`str or Image`	Input image or path.	required
`bboxes`	`List[List[float]]`	Ground-truth bounding boxes in [x1, y1, x2, y2] format.	required
`classes`	`List[str]`	Class name for each bounding box.	required

Returns:

Name	Type	Description
`GenericOutputs`		Processed inputs including pixel values, text tokens, boxes, and class IDs.

Source code in src/unitorch/models/grounding_dino/processing.py

def detection(
    self,
    text: str,
    image: Union[str, Image.Image],
    bboxes: List[List[float]],
    classes: List[str],
):
    """
    Processes image and text for training with ground-truth detections.

    Boxes are divided by the original image width/height, and every class name is
    mapped to the position (in the tokenized ``text``) of the first occurrence of
    the class name's first token.

    Args:
        text (str): Input text describing objects.
        image (str or PIL.Image.Image): Input image or path.
        bboxes (List[List[float]]): Ground-truth bounding boxes in [x1, y1, x2, y2] format.
            A single flat box ``[x1, y1, x2, y2]`` and an empty list are also accepted.
        classes (List[str]): Class name for each bounding box.

    Returns:
        GenericOutputs: Processed inputs including pixel values, text tokens, boxes, and class IDs.

    Raises:
        AssertionError: If a class name is not a substring of ``text``, if the boxes
            do not have four coordinates, or if the number of boxes and classes differ.
        ValueError: If a class name yields no tokens, or its first token is not
            present in the encoded text.
    """
    if isinstance(image, str):
        image = Image.open(image).convert("RGB")
    org_w, org_h = image.size

    pixel_outputs = HfImageClassificationProcessor.classification(self, image)
    text_outputs = HfTextClassificationProcessor.classification(self, text)

    bboxes = torch.tensor(bboxes).float()
    # Promote to 2-D *before* any column indexing; otherwise a flat box or an
    # empty list fails with an IndexError on ``bboxes[:, 0]``.
    if bboxes.dim() == 1:
        bboxes = bboxes.unsqueeze(0) if bboxes.numel() else bboxes.reshape(0, 4)
    assert (
        bboxes.dim() == 2 and bboxes.size(-1) == 4
    ), "bboxes must have shape (num_boxes, 4)"

    # Normalise x coordinates by image width and y coordinates by image height.
    bboxes[:, 0::2] /= org_w
    bboxes[:, 1::2] /= org_h

    assert all(c in text for c in classes), "every class name must appear in text"
    # assumes input_ids is a 1-D tensor for a single sample — TODO confirm
    ground_truth = text_outputs.input_ids.long().tolist()
    class_ids = []
    for c in classes:
        class_tokens = self.tokenizer.tokenize(c)
        class_token_ids = self.tokenizer.convert_tokens_to_ids(class_tokens)
        if not class_token_ids:
            raise ValueError(f"class name {c!r} produced no tokens")
        try:
            class_idx = ground_truth.index(class_token_ids[0])
        except ValueError as err:
            # Substring containment does not guarantee a token match, and the
            # encoded text may not contain every token of the raw text.
            raise ValueError(
                f"first token of class {c!r} was not found in the encoded text"
            ) from err
        class_ids.append(class_idx)

    # Explicit dtype keeps the tensor integral even when there are no classes.
    classes = torch.tensor(class_ids, dtype=torch.long)

    assert classes.dim() == 1 and len(classes) == len(
        bboxes
    ), "bboxes and classes must have the same length"
    return GenericOutputs(
        pixel_values=pixel_outputs.pixel_values,
        input_ids=text_outputs.input_ids,
        attention_mask=text_outputs.attention_mask,
        token_type_ids=text_outputs.token_type_ids,
        bboxes=bboxes,
        classes=classes,
    )

detection_inputs ¤

detection_inputs(text: str, image: Union[str, Image])

Processes image and text for inference.

Parameters:

Name	Type	Description	Default
`text`	`str`	Input text describing objects.	required
`image`	`str or Image`	Input image or path.	required

Returns:

Name	Type	Description
`GenericOutputs`		Processed pixel values and text tokens.

Source code in src/unitorch/models/grounding_dino/processing.py

def detection_inputs(
    self,
    text: str,
    image: Union[str, Image.Image],
):
    """
    Prepares an image/text pair for inference.

    Args:
        text (str): Input text describing objects.
        image (str or PIL.Image.Image): Image object, or a path that is opened
            and converted to RGB.

    Returns:
        GenericOutputs: ``pixel_values`` from the image processor together with
            ``input_ids``, ``attention_mask`` and ``token_type_ids`` from the text processor.
    """
    pil_image = Image.open(image).convert("RGB") if isinstance(image, str) else image

    vision_part = HfImageClassificationProcessor.classification(self, pil_image)
    text_part = HfTextClassificationProcessor.classification(self, text)

    text_fields = {
        field: getattr(text_part, field)
        for field in ("input_ids", "attention_mask", "token_type_ids")
    }
    return GenericOutputs(pixel_values=vision_part.pixel_values, **text_fields)

GroundingDinoForDetection¤

Bases: GenericModel

Initializes the GroundingDinoForDetection model.

Parameters:

Name	Type	Description	Default
`config_path`	`str`	Path to the GroundingDINO configuration file.	required

Source code in src/unitorch/models/grounding_dino/modeling.py

def __init__(
    self,
    config_path: str,
):
    """
    Builds the GroundingDINO backbone, its prediction heads and the training criterion.

    Args:
        config_path (str): Path to the GroundingDINO configuration file.
    """
    super().__init__()
    config = GroundingDinoConfig.from_json_file(config_path)
    self.config = config
    self.model = GroundingDinoModel(config)

    layer_count = config.decoder_layers
    # One contrastive head instance is reused for every decoder layer.
    shared_class_head = GroundingDinoContrastiveEmbedding(config)

    def new_bbox_head():
        return GroundingDinoMLPPredictionHead(
            input_dim=config.d_model,
            hidden_dim=config.d_model,
            output_dim=4,
            num_layers=3,
        )

    if config.decoder_bbox_embed_share:
        # Same module object in every slot, so parameters are shared.
        shared_bbox_head = new_bbox_head()
        bbox_heads = [shared_bbox_head] * layer_count
    else:
        bbox_heads = [new_bbox_head() for _ in range(layer_count)]

    self.bbox_embed = nn.ModuleList(bbox_heads)
    self.class_embed = nn.ModuleList([shared_class_head] * layer_count)

    # The decoder gets references to the same heads before weights are initialised.
    self.model.decoder.bbox_embed = self.bbox_embed
    self.model.decoder.class_embed = self.class_embed
    self.init_weights()

    self.enable_auxiliary_loss = config.auxiliary_loss
    self.matcher = GroundingDinoHungarianMatcher(
        class_cost=config.class_cost,
        bbox_cost=config.bbox_cost,
        giou_cost=config.giou_cost,
    )
    self.losses = ["labels", "boxes", "cardinality"]
    self.criterion = GroundingDinoLoss(
        matcher=self.matcher,
        num_classes=config.num_labels,
        focal_alpha=config.focal_alpha,
        losses=self.losses,
    )

replace_keys_in_state_dict `class-attribute` `instance-attribute` ¤

replace_keys_in_state_dict = {}

config `instance-attribute` ¤

config = from_json_file(config_path)

model `instance-attribute` ¤

model = GroundingDinoModel(config)

bbox_embed `instance-attribute` ¤

bbox_embed = ModuleList(
    [_bbox_embed for _ in (range(decoder_layers))]
)

class_embed `instance-attribute` ¤

class_embed = ModuleList(
    [_class_embed for _ in (range(decoder_layers))]
)

enable_auxiliary_loss `instance-attribute` ¤

enable_auxiliary_loss = auxiliary_loss

matcher `instance-attribute` ¤

matcher = DeformableDetrHungarianMatcher(
    class_cost=class_cost,
    bbox_cost=bbox_cost,
    giou_cost=giou_cost,
)

losses `instance-attribute` ¤

losses = ['labels', 'boxes', 'cardinality']

criterion `instance-attribute` ¤

criterion = DeformableDetrImageLoss(
    matcher=matcher,
    num_classes=num_labels,
    focal_alpha=focal_alpha,
    losses=losses,
)

dtype `property` ¤

dtype

device `property` ¤

device

_set_aux_loss ¤

_set_aux_loss(
    outputs_class: Tensor, outputs_coord: Tensor
) -> List[Dict]

Source code in src/unitorch/models/grounding_dino/modeling.py

def _set_aux_loss(
    self,
    outputs_class: torch.Tensor,
    outputs_coord: torch.Tensor,
) -> List[Dict]:
    """
    Pairs class logits and boxes for every entry along dim 0 except the last.

    Args:
        outputs_class (torch.Tensor): Stacked class logits; the leading dim is iterated.
        outputs_coord (torch.Tensor): Stacked box predictions; the leading dim is iterated.

    Returns:
        List[Dict]: One ``{"logits", "pred_boxes"}`` dict per non-final entry.
    """
    aux_outputs = []
    for layer_logits, layer_boxes in zip(outputs_class[:-1], outputs_coord[:-1]):
        aux_outputs.append({"logits": layer_logits, "pred_boxes": layer_boxes})
    return aux_outputs

_decode_outputs ¤

_decode_outputs(
    hidden_states: Tensor,
    enc_text_hidden_state: Tensor,
    init_reference_points: Tensor,
    inter_references_points: Tensor,
    attention_mask: Tensor,
)

Decodes model outputs into class and coordinate predictions.

Source code in src/unitorch/models/grounding_dino/modeling.py

def _decode_outputs(
    self,
    hidden_states: torch.Tensor,
    enc_text_hidden_state: torch.Tensor,
    init_reference_points: torch.Tensor,
    inter_references_points: torch.Tensor,
    attention_mask: torch.Tensor,
):
    """
    Turns per-level hidden states into class logits and sigmoid-activated boxes.

    Dim 1 of ``hidden_states`` is iterated as the level axis. Level 0 uses
    ``init_reference_points`` as reference; level ``k > 0`` uses
    ``inter_references_points[:, k - 1]``.

    Returns:
        Tuple of two tensors stacked over levels: class outputs and box coordinates.

    Raises:
        ValueError: If the reference's last dimension is neither 2 nor 4.
    """
    text_token_mask = attention_mask.bool()
    per_level_classes, per_level_boxes = [], []

    for idx in range(hidden_states.shape[1]):
        level_hidden = hidden_states[:, idx]

        if idx == 0:
            anchor = init_reference_points
        else:
            anchor = inter_references_points[:, idx - 1]
        # Work in logit space so the predicted delta can simply be added.
        anchor = torch.special.logit(anchor, eps=1e-5)

        level_logits = self.class_embed[idx](
            vision_hidden_state=level_hidden,
            text_hidden_state=enc_text_hidden_state,
            text_token_mask=text_token_mask,
        )
        box_delta = self.bbox_embed[idx](level_hidden)

        ref_dim = anchor.shape[-1]
        if ref_dim not in (2, 4):
            raise ValueError(
                f"reference.shape[-1] should be 4 or 2, but got {ref_dim}"
            )
        if ref_dim == 4:
            box_logits = box_delta + anchor
        else:
            # 2-D reference: only the first two coordinates are offset, in place.
            box_delta[..., :2] += anchor
            box_logits = box_delta

        per_level_classes.append(level_logits)
        per_level_boxes.append(box_logits.sigmoid())

    return torch.stack(per_level_classes), torch.stack(per_level_boxes)

forward ¤

forward(
    pixel_values: Tensor,
    input_ids: Tensor,
    token_type_ids: Tensor,
    attention_mask: Tensor,
    bboxes: Union[List[Tensor], Tensor],
    classes: Union[List[Tensor], Tensor],
)

Forward pass computing detection loss.

Parameters:

Name	Type	Description	Default
`pixel_values`	`Tensor`	Input image tensor.	required
`input_ids`	`Tensor`	Text token IDs.	required
`token_type_ids`	`Tensor`	Token type IDs.	required
`attention_mask`	`Tensor`	Attention mask.	required
`bboxes`	`List[Tensor] or Tensor`	Ground-truth boxes in xyxy format.	required
`classes`	`List[Tensor] or Tensor`	Ground-truth class IDs.	required

Returns:

Type	Description
`Tensor`	Total detection loss.

Source code in src/unitorch/models/grounding_dino/modeling.py

def forward(
    self,
    pixel_values: torch.Tensor,
    input_ids: torch.Tensor,
    token_type_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    bboxes: Union[List[torch.Tensor], torch.Tensor],
    classes: Union[List[torch.Tensor], torch.Tensor],
):
    """
    Forward pass computing detection loss.

    Args:
        pixel_values (torch.Tensor): Input image tensor.
        input_ids (torch.Tensor): Text token IDs.
        token_type_ids (torch.Tensor): Token type IDs.
        attention_mask (torch.Tensor): Attention mask.
        bboxes (List[torch.Tensor] or torch.Tensor): Ground-truth boxes in xyxy format,
            one entry per sample.
        classes (List[torch.Tensor] or torch.Tensor): Ground-truth class IDs,
            one entry per sample.

    Returns:
        torch.Tensor: Weighted sum of the criterion's losses. Loss entries whose
            key has no configured weight are left out of the sum.
    """
    outputs = self.model(
        pixel_values=pixel_values,
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    )

    outputs_class, outputs_coord = self._decode_outputs(
        hidden_states=outputs.intermediate_hidden_states,
        enc_text_hidden_state=outputs.encoder_last_hidden_state_text,
        init_reference_points=outputs.init_reference_points,
        inter_references_points=outputs.intermediate_reference_points,
        attention_mask=attention_mask,
    )

    # The last stacked entry is the main prediction; earlier ones are auxiliary.
    outputs_loss = {"logits": outputs_class[-1], "pred_boxes": outputs_coord[-1]}
    if self.enable_auxiliary_loss:
        outputs_loss["auxiliary_outputs"] = self._set_aux_loss(
            outputs_class, outputs_coord
        )
    if self.config.two_stage:
        # Relies on the last two populated model outputs being the encoder
        # class logits and (pre-sigmoid) box logits — TODO confirm against the
        # model output definition.
        outputs_loss["enc_outputs"] = {
            "logits": outputs[-2],
            "pred_boxes": outputs[-1].sigmoid(),
        }

    # Ground-truth boxes are converted from xyxy via the project's xyxy2xywh helper.
    labels = [
        {"class_labels": c, "boxes": xyxy2xywh(b)} for b, c in zip(bboxes, classes)
    ]

    loss_dict = self.criterion(outputs_loss, labels)

    weight_dict = {
        "loss_ce": 1,
        "loss_bbox": self.config.bbox_loss_coefficient,
        "loss_giou": self.config.giou_loss_coefficient,
    }
    # Gate the auxiliary weights on the same flag that gates the auxiliary
    # outputs above, so auxiliary losses are never computed and then dropped.
    if self.enable_auxiliary_loss:
        base_weights = dict(weight_dict)
        for i in range(self.config.decoder_layers - 1):
            weight_dict.update({f"{k}_{i}": v for k, v in base_weights.items()})

    return sum(
        value * weight_dict[name]
        for name, value in loss_dict.items()
        if name in weight_dict
    )

detect ¤

detect(
    pixel_values: Tensor,
    input_ids: Tensor,
    token_type_ids: Tensor,
    attention_mask: Tensor,
    norm_bboxes: Optional[bool] = False,
    text_threshold: Optional[float] = 0.25,
    box_threshold: Optional[float] = 0.25,
)

Runs detection inference and returns filtered predictions.

Parameters:

Name	Type	Description	Default
`pixel_values`	`Tensor`	Input image tensor.	required
`input_ids`	`Tensor`	Text token IDs.	required
`token_type_ids`	`Tensor`	Token type IDs.	required
`attention_mask`	`Tensor`	Attention mask.	required
`norm_bboxes`	`bool`	Whether to return normalized boxes. Defaults to False.	`False`
`text_threshold`	`float`	Threshold for text token scores. Defaults to 0.25.	`0.25`
`box_threshold`	`float`	Threshold for box confidence scores. Defaults to 0.25.	`0.25`

Returns:

Name	Type	Description
`GenericOutputs`		Detected bounding boxes, scores, and class IDs.

Source code in src/unitorch/models/grounding_dino/modeling.py

@torch.no_grad()
def detect(
    self,
    pixel_values: torch.Tensor,
    input_ids: torch.Tensor,
    token_type_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    norm_bboxes: Optional[bool] = False,
    text_threshold: Optional[float] = 0.25,
    box_threshold: Optional[float] = 0.25,
):
    """
    Runs inference and keeps, per image, the predictions above ``box_threshold``.

    Args:
        pixel_values (torch.Tensor): Input image tensor; its last two dims are
            used as height and width when boxes are un-normalized.
        input_ids (torch.Tensor): Text token IDs.
        token_type_ids (torch.Tensor): Token type IDs.
        attention_mask (torch.Tensor): Attention mask.
        norm_bboxes (bool, optional): If True, boxes are returned without being
            scaled by width/height. Defaults to False.
        text_threshold (float, optional): Token probabilities at or below this
            value have their token ID zeroed. Defaults to 0.25.
        box_threshold (float, optional): Predictions whose best token probability
            is at or below this value are dropped. Defaults to 0.25.

    Returns:
        GenericOutputs: Per-image lists ``bboxes``, ``scores`` and ``classes``;
            each ``classes`` row holds the input token IDs that passed
            ``text_threshold`` and 0 elsewhere.
    """
    h, w = pixel_values.shape[-2:]
    outputs = self.model(
        pixel_values=pixel_values,
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    )

    outputs_class, outputs_coord = self._decode_outputs(
        hidden_states=outputs.intermediate_hidden_states,
        enc_text_hidden_state=outputs.encoder_last_hidden_state_text,
        init_reference_points=outputs.init_reference_points,
        inter_references_points=outputs.intermediate_reference_points,
        attention_mask=attention_mask,
    )

    # Only the last stacked entry is used for inference.
    logits = outputs_class[-1]
    probs = torch.sigmoid(logits)
    scores = probs.max(dim=-1).values

    pred_boxes = [xywh2xyxy(bbox) for bbox in outputs_coord[-1]]
    if norm_bboxes:
        bboxes = pred_boxes
    else:
        bboxes = [b * torch.tensor([w, h, w, h]).to(b) for b in pred_boxes]

    # Repeat the token IDs for every prediction so they can be masked per box.
    class_ids = input_ids.unsqueeze(1).expand(-1, bboxes[0].shape[0], -1)
    # Logits and input_ids may differ in text length; use the shorter one.
    max_len = min(input_ids.shape[-1], logits.shape[-1])

    kept_boxes, kept_scores, kept_classes = [], [], []
    for b, s, p, c in zip(bboxes, scores, probs, class_ids):
        keep = s > box_threshold
        token_hits = (p[keep] > text_threshold).float()[:, :max_len]
        token_ids = c[keep].float()[:, :max_len]
        kept_boxes.append(b[keep])
        kept_scores.append(s[keep])
        kept_classes.append((token_hits * token_ids).long())

    return GenericOutputs(
        bboxes=kept_boxes,
        scores=kept_scores,
        classes=kept_classes,
    )

unitorch.models.grounding_dino¤

GroundingDinoProcessor¤

bert_tokenizer instance-attribute ¤

vision_processor instance-attribute ¤

detection ¤

detection_inputs ¤

GroundingDinoForDetection¤

replace_keys_in_state_dict class-attribute instance-attribute ¤

config instance-attribute ¤

model instance-attribute ¤

bbox_embed instance-attribute ¤

class_embed instance-attribute ¤

enable_auxiliary_loss instance-attribute ¤

matcher instance-attribute ¤

losses instance-attribute ¤

criterion instance-attribute ¤

dtype property ¤

device property ¤

_set_aux_loss ¤

_decode_outputs ¤

forward ¤

detect ¤

bert_tokenizer `instance-attribute` ¤

vision_processor `instance-attribute` ¤

replace_keys_in_state_dict `class-attribute` `instance-attribute` ¤

config `instance-attribute` ¤

model `instance-attribute` ¤

bbox_embed `instance-attribute` ¤

class_embed `instance-attribute` ¤

enable_auxiliary_loss `instance-attribute` ¤

matcher `instance-attribute` ¤

losses `instance-attribute` ¤

criterion `instance-attribute` ¤

dtype `property` ¤

device `property` ¤