Skip to content

unitorch.models.grounding_dino¤

GroundingDinoProcessor¤

Bases: HfTextClassificationProcessor, HfImageClassificationProcessor

Initializes the GroundingDinoProcessor.

Parameters:

Name Type Description Default
vocab_path str

Path to the BERT vocabulary file.

required
vision_config_path str

Path to the GroundingDINO image processor configuration file.

required
max_seq_length int

Maximum sequence length. Defaults to 128.

128
position_start_id int

Starting position ID. Defaults to 0.

0
Source code in src/unitorch/models/grounding_dino/processing.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def __init__(
    self,
    vocab_path: str,
    vision_config_path: str,
    max_seq_length: Optional[int] = 128,
    position_start_id: Optional[int] = 0,
):
    """
    Build the BERT tokenizer and the image processor, then initialise both parent processors.

    Args:
        vocab_path (str): Path to the BERT vocabulary file.
        vision_config_path (str): Path to the GroundingDINO image processor JSON configuration.
        max_seq_length (int, optional): Maximum sequence length passed to the text processor. Defaults to 128.
        position_start_id (int, optional): Starting position ID passed to the text processor. Defaults to 0.
    """
    tokenizer = BertTokenizer(vocab_path, do_basic_tokenize=True, do_lower_case=True)
    image_processor = GroundingDinoImageProcessor.from_json_file(vision_config_path)

    # Keep both components reachable on the instance before the parents are set up.
    self.bert_tokenizer = tokenizer
    self.vision_processor = image_processor

    # The two bases take different arguments, so each one is initialised
    # explicitly instead of going through a cooperative super() chain.
    text_settings = dict(
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        source_type_id=0,
        target_type_id=1,
        position_start_id=position_start_id,
    )
    HfTextClassificationProcessor.__init__(self, **text_settings)
    HfImageClassificationProcessor.__init__(self, vision_processor=image_processor)

bert_tokenizer instance-attribute ¤

bert_tokenizer = BertTokenizer(
    vocab_path, do_basic_tokenize=True, do_lower_case=True
)

vision_processor instance-attribute ¤

vision_processor = from_json_file(vision_config_path)

detection ¤

detection(
    text: str,
    image: Union[str, Image],
    bboxes: List[List[float]],
    classes: List[str],
)

Processes image and text for training with ground-truth detections.

Parameters:

Name Type Description Default
text str

Input text describing objects.

required
image str or Image

Input image or path.

required
bboxes List[List[float]]

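Ground-truth bounding boxes in [x1, y1, x2, y2] format, given in pixel coordinates of the input image; they are divided by the image width and height during processing.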

required
classes List[str]

Class name for each bounding box.

required

Returns:

Name Type Description
GenericOutputs

Processed inputs including pixel values, text tokens, normalized boxes, and, for each box, the position in the tokenized text of the first token of its class name.

Source code in src/unitorch/models/grounding_dino/processing.py
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
def detection(
    self,
    text: str,
    image: Union[str, Image.Image],
    bboxes: List[List[float]],
    classes: List[str],
):
    """
    Processes image and text for training with ground-truth detections.

    Args:
        text (str): Input text describing objects. Every entry of `classes` must be a substring of it.
        image (str or PIL.Image.Image): Input image or path. A path is opened and converted to RGB.
        bboxes (List[List[float]]): Ground-truth bounding boxes in [x1, y1, x2, y2] format, in pixel
            coordinates of `image`. A single flat box ``[x1, y1, x2, y2]`` and an empty list are accepted.
        classes (List[str]): Class name for each bounding box.

    Returns:
        GenericOutputs: `pixel_values`, `input_ids`, `attention_mask`, `token_type_ids`, `bboxes`
            (float tensor of shape (N, 4), divided by the image width/height) and `classes`
            (long tensor of shape (N,) holding, for each box, the position in `input_ids` of the
            first token of its class name).

    Raises:
        AssertionError: If a class name is not contained in `text`, if the boxes do not have four
            coordinates, or if the number of boxes and classes differ.
        ValueError: If the first token of a class name does not occur in the tokenized text.
    """
    if isinstance(image, str):
        image = Image.open(image).convert("RGB")
    org_w, org_h = image.size

    pixel_outputs = HfImageClassificationProcessor.classification(self, image)
    text_outputs = HfTextClassificationProcessor.classification(self, text)

    # Bring the boxes to shape (N, 4) *before* touching individual columns, so
    # that a single flat box and an empty list are handled instead of failing
    # on the column indexing.
    bboxes = torch.tensor(bboxes).float()
    if bboxes.numel() == 0:
        bboxes = bboxes.reshape(0, 4)
    elif bboxes.dim() == 1:
        bboxes = bboxes.unsqueeze(0)
    assert (
        bboxes.dim() == 2 and bboxes.size(-1) == 4
    ), "bboxes must contain four coordinates per box"

    # Normalize x by the image width and y by the image height.
    bboxes = bboxes / bboxes.new_tensor([org_w, org_h, org_w, org_h])

    assert all(c in text for c in classes), "every class name must appear in `text`"
    ground_truth = text_outputs.input_ids.long().tolist()
    class_ids = []
    for c in classes:
        # NOTE(review): self.tokenizer is presumably set by
        # HfTextClassificationProcessor.__init__ - confirm.
        class_tokens = self.tokenizer.tokenize(c)
        class_token_ids = self.tokenizer.convert_tokens_to_ids(class_tokens)
        # A class is identified by the first position at which its first token
        # occurs; list.index raises ValueError when that token is absent.
        class_ids.append(ground_truth.index(class_token_ids[0]))

    # Explicit dtype so that an empty list still yields an integer tensor.
    classes = torch.tensor(class_ids, dtype=torch.long)

    assert classes.dim() == 1 and len(classes) == len(
        bboxes
    ), "bboxes and classes must have the same length"
    return GenericOutputs(
        pixel_values=pixel_outputs.pixel_values,
        input_ids=text_outputs.input_ids,
        attention_mask=text_outputs.attention_mask,
        token_type_ids=text_outputs.token_type_ids,
        bboxes=bboxes,
        classes=classes,
    )

detection_inputs ¤

detection_inputs(text: str, image: Union[str, Image])

Processes image and text for inference.

Parameters:

Name Type Description Default
text str

Input text describing objects.

required
image str or Image

Input image or path.

required

Returns:

Name Type Description
GenericOutputs

Processed pixel values and text tokens.

Source code in src/unitorch/models/grounding_dino/processing.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def detection_inputs(
    self,
    text: str,
    image: Union[str, Image.Image],
):
    """
    Prepare one image/text pair for inference (no ground truth involved).

    Args:
        text (str): Input text describing objects.
        image (str or PIL.Image.Image): Input image, or a path that is opened and converted to RGB.

    Returns:
        GenericOutputs: `pixel_values` from the image processor plus `input_ids`,
            `attention_mask` and `token_type_ids` from the text processor.
    """
    pil_image = Image.open(image).convert("RGB") if isinstance(image, str) else image

    # Each modality goes through its own parent-class pipeline.
    vision = HfImageClassificationProcessor.classification(self, pil_image)
    language = HfTextClassificationProcessor.classification(self, text)

    fields = {
        "pixel_values": vision.pixel_values,
        "input_ids": language.input_ids,
        "attention_mask": language.attention_mask,
        "token_type_ids": language.token_type_ids,
    }
    return GenericOutputs(**fields)

GroundingDinoForDetection¤

Bases: GenericModel

Initializes the GroundingDinoForDetection model.

Parameters:

Name Type Description Default
config_path str

Path to the GroundingDINO configuration file.

required
Source code in src/unitorch/models/grounding_dino/modeling.py
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def __init__(
    self,
    config_path: str,
):
    """
    Build the GroundingDINO backbone, its per-layer prediction heads and the training criterion.

    Args:
        config_path (str): Path to the GroundingDINO configuration file.
    """
    super().__init__()
    cfg = GroundingDinoConfig.from_json_file(config_path)
    self.config = cfg
    self.model = GroundingDinoModel(cfg)

    # One contrastive classification head, reused by every decoder layer.
    shared_class_head = GroundingDinoContrastiveEmbedding(cfg)
    layer_count = cfg.decoder_layers

    def make_bbox_head():
        # Box regression MLP producing four values per query.
        return GroundingDinoMLPPredictionHead(
            input_dim=cfg.d_model,
            hidden_dim=cfg.d_model,
            output_dim=4,
            num_layers=3,
        )

    if cfg.decoder_bbox_embed_share:
        # Every decoder layer points at the same module instance.
        shared_bbox_head = make_bbox_head()
        bbox_heads = [shared_bbox_head] * layer_count
    else:
        # Each decoder layer gets its own independent head.
        bbox_heads = [make_bbox_head() for _ in range(layer_count)]

    self.bbox_embed = nn.ModuleList(bbox_heads)
    self.class_embed = nn.ModuleList([shared_class_head] * layer_count)

    # The decoder is handed the same head lists that this wrapper holds.
    self.model.decoder.bbox_embed = self.bbox_embed
    self.model.decoder.class_embed = self.class_embed
    self.init_weights()

    self.enable_auxiliary_loss = cfg.auxiliary_loss

    matcher = GroundingDinoHungarianMatcher(
        class_cost=cfg.class_cost,
        bbox_cost=cfg.bbox_cost,
        giou_cost=cfg.giou_cost,
    )
    self.matcher = matcher

    loss_names = ["labels", "boxes", "cardinality"]
    self.losses = loss_names
    self.criterion = GroundingDinoLoss(
        matcher=matcher,
        num_classes=cfg.num_labels,
        focal_alpha=cfg.focal_alpha,
        losses=loss_names,
    )

replace_keys_in_state_dict class-attribute instance-attribute ¤

replace_keys_in_state_dict = {}

config instance-attribute ¤

config = from_json_file(config_path)

model instance-attribute ¤

model = GroundingDinoModel(config)

bbox_embed instance-attribute ¤

bbox_embed = ModuleList(
    [_bbox_embed for _ in (range(decoder_layers))]
)

class_embed instance-attribute ¤

class_embed = ModuleList(
    [_class_embed for _ in (range(decoder_layers))]
)

enable_auxiliary_loss instance-attribute ¤

enable_auxiliary_loss = auxiliary_loss

matcher instance-attribute ¤

matcher = DeformableDetrHungarianMatcher(
    class_cost=class_cost,
    bbox_cost=bbox_cost,
    giou_cost=giou_cost,
)

losses instance-attribute ¤

losses = ['labels', 'boxes', 'cardinality']

criterion instance-attribute ¤

criterion = DeformableDetrImageLoss(
    matcher=matcher,
    num_classes=num_labels,
    focal_alpha=focal_alpha,
    losses=losses,
)

dtype property ¤

dtype

device property ¤

device

_set_aux_loss ¤

_set_aux_loss(
    outputs_class: Tensor, outputs_coord: Tensor
) -> List[Dict]
Source code in src/unitorch/models/grounding_dino/modeling.py
116
117
118
119
120
121
122
123
124
def _set_aux_loss(
    self,
    outputs_class: torch.Tensor,
    outputs_coord: torch.Tensor,
) -> List[Dict]:
    """
    Pair up the per-layer predictions of every layer except the last one.

    Args:
        outputs_class (torch.Tensor): Class predictions, indexed by layer along the first axis.
        outputs_coord (torch.Tensor): Box predictions, indexed by layer along the first axis.

    Returns:
        List[Dict]: One ``{"logits": ..., "pred_boxes": ...}`` dict per layer, last layer excluded.
    """
    auxiliary = []
    # The final layer is dropped from both inputs before pairing.
    for layer_logits, layer_boxes in zip(outputs_class[:-1], outputs_coord[:-1]):
        auxiliary.append({"logits": layer_logits, "pred_boxes": layer_boxes})
    return auxiliary

_decode_outputs ¤

_decode_outputs(
    hidden_states: Tensor,
    enc_text_hidden_state: Tensor,
    init_reference_points: Tensor,
    inter_references_points: Tensor,
    attention_mask: Tensor,
)

Decodes model outputs into class and coordinate predictions.

Source code in src/unitorch/models/grounding_dino/modeling.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
def _decode_outputs(
    self,
    hidden_states: torch.Tensor,
    enc_text_hidden_state: torch.Tensor,
    init_reference_points: torch.Tensor,
    inter_references_points: torch.Tensor,
    attention_mask: torch.Tensor,
):
    """
    Turn per-level decoder states into class predictions and box coordinates.

    Levels are taken along dimension 1 of `hidden_states`. Level 0 is refined
    against `init_reference_points`, level ``k > 0`` against
    ``inter_references_points[:, k - 1]``.

    Args:
        hidden_states (torch.Tensor): Decoder states, one slice per level along dimension 1.
        enc_text_hidden_state (torch.Tensor): Text features handed to every class head.
        init_reference_points (torch.Tensor): Reference points for the first level; last dimension 2 or 4.
        inter_references_points (torch.Tensor): Reference points for the later levels, indexed along dimension 1.
        attention_mask (torch.Tensor): Text mask; converted to bool before being passed to the class heads.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: Class outputs and sigmoid-activated box outputs, each
            stacked over levels along a new leading dimension.

    Raises:
        ValueError: If the last dimension of the reference points is neither 4 nor 2.
    """
    per_level_classes = []
    per_level_coords = []

    for level in range(hidden_states.shape[1]):
        if level == 0:
            reference = init_reference_points
        else:
            reference = inter_references_points[:, level - 1]
        # Work in logit space so the predicted delta can be added before the sigmoid.
        reference = torch.special.logit(reference, eps=1e-5)

        level_hidden = hidden_states[:, level]
        level_class = self.class_embed[level](
            vision_hidden_state=level_hidden,
            text_hidden_state=enc_text_hidden_state,
            text_token_mask=attention_mask.bool(),
        )
        delta_bbox = self.bbox_embed[level](level_hidden)

        ref_dim = reference.shape[-1]
        if ref_dim == 2:
            # Only the first two box components are offset by the reference.
            delta_bbox[..., :2] += reference
            coord_logits = delta_bbox
        elif ref_dim == 4:
            coord_logits = delta_bbox + reference
        else:
            raise ValueError(
                f"reference.shape[-1] should be 4 or 2, but got {ref_dim}"
            )

        per_level_classes.append(level_class)
        per_level_coords.append(coord_logits.sigmoid())

    stacked_classes = torch.stack(per_level_classes)
    stacked_coords = torch.stack(per_level_coords)
    return stacked_classes, stacked_coords

forward ¤

forward(
    pixel_values: Tensor,
    input_ids: Tensor,
    token_type_ids: Tensor,
    attention_mask: Tensor,
    bboxes: Union[List[Tensor], Tensor],
    classes: Union[List[Tensor], Tensor],
)

Forward pass computing detection loss.

Parameters:

Name Type Description Default
pixel_values Tensor

Input image tensor.

required
input_ids Tensor

Text token IDs.

required
token_type_ids Tensor

Token type IDs.

required
attention_mask Tensor

Attention mask.

required
bboxes List[Tensor] or Tensor

Ground-truth boxes in xyxy format.

required
classes List[Tensor] or Tensor

Ground-truth class IDs.

required

Returns:

Type Description

torch.Tensor: Total detection loss.

Source code in src/unitorch/models/grounding_dino/modeling.py
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
def forward(
    self,
    pixel_values: torch.Tensor,
    input_ids: torch.Tensor,
    token_type_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    bboxes: Union[List[torch.Tensor], torch.Tensor],
    classes: Union[List[torch.Tensor], torch.Tensor],
):
    """
    Forward pass computing detection loss.

    Args:
        pixel_values (torch.Tensor): Input image tensor.
        input_ids (torch.Tensor): Text token IDs.
        token_type_ids (torch.Tensor): Token type IDs.
        attention_mask (torch.Tensor): Attention mask.
        bboxes (List[torch.Tensor] or torch.Tensor): Ground-truth boxes in xyxy format, one entry per sample.
        classes (List[torch.Tensor] or torch.Tensor): Ground-truth class IDs, one entry per sample.

    Returns:
        torch.Tensor: Weighted sum of every criterion loss that has an entry in the weight table.
    """
    outputs = self.model(
        pixel_values=pixel_values,
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    )

    outputs_class, outputs_coord = self._decode_outputs(
        hidden_states=outputs.intermediate_hidden_states,
        enc_text_hidden_state=outputs.encoder_last_hidden_state_text,
        init_reference_points=outputs.init_reference_points,
        inter_references_points=outputs.intermediate_reference_points,
        attention_mask=attention_mask,
    )

    # The main loss uses the predictions of the last decoder level.
    logits = outputs_class[-1]
    pred_boxes = outputs_coord[-1]

    # Targets are converted per sample before being handed to the criterion.
    bboxes = [xyxy2xywh(bbox) for bbox in bboxes]
    labels = [{"class_labels": c, "boxes": b} for b, c in zip(bboxes, classes)]

    # A single flag decides both whether auxiliary outputs are produced and
    # whether their losses receive a weight. Reading two different flags here
    # would let auxiliary losses be computed and then silently dropped.
    use_auxiliary = self.enable_auxiliary_loss

    outputs_loss = {"logits": logits, "pred_boxes": pred_boxes}
    if use_auxiliary:
        outputs_loss["auxiliary_outputs"] = self._set_aux_loss(
            outputs_class, outputs_coord
        )
    if self.config.two_stage:
        # NOTE(review): relies on the last two entries of the model output
        # being the encoder class logits and box logits - confirm against the
        # model's output layout.
        outputs_loss["enc_outputs"] = {
            "logits": outputs[-2],
            "pred_boxes": outputs[-1].sigmoid(),
        }

    loss_dict = self.criterion(outputs_loss, labels)
    weight_dict = {
        "loss_ce": 1,
        "loss_bbox": self.config.bbox_loss_coefficient,
        "loss_giou": self.config.giou_loss_coefficient,
    }
    if use_auxiliary:
        # Replicate the base weights for each auxiliary layer ("<name>_<i>").
        # Built in a separate dict so only the three base keys are suffixed.
        aux_weight_dict = {
            f"{k}_{i}": v
            for i in range(self.config.decoder_layers - 1)
            for k, v in weight_dict.items()
        }
        weight_dict.update(aux_weight_dict)

    # Losses without a weight entry are left out of the total.
    return sum(
        loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict
    )

detect ¤

detect(
    pixel_values: Tensor,
    input_ids: Tensor,
    token_type_ids: Tensor,
    attention_mask: Tensor,
    norm_bboxes: Optional[bool] = False,
    text_threshold: Optional[float] = 0.25,
    box_threshold: Optional[float] = 0.25,
)

Runs detection inference and returns filtered predictions.

Parameters:

Name Type Description Default
pixel_values Tensor

Input image tensor.

required
input_ids Tensor

Text token IDs.

required
token_type_ids Tensor

Token type IDs.

required
attention_mask Tensor

Attention mask.

required
norm_bboxes bool

Whether to return normalized boxes. When False, boxes are scaled by the width and height of pixel_values. Defaults to False.

False
text_threshold float

Threshold for text token scores. Defaults to 0.25.

0.25
box_threshold float

Threshold for box confidence scores. Defaults to 0.25.

0.25

Returns:

Name Type Description
GenericOutputs

Detected bounding boxes, scores, and class IDs.

Source code in src/unitorch/models/grounding_dino/modeling.py
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
@torch.no_grad()
def detect(
    self,
    pixel_values: torch.Tensor,
    input_ids: torch.Tensor,
    token_type_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    norm_bboxes: Optional[bool] = False,
    text_threshold: Optional[float] = 0.25,
    box_threshold: Optional[float] = 0.25,
):
    """
    Runs detection inference and returns filtered predictions.

    Args:
        pixel_values (torch.Tensor): Input image tensor; its last two dimensions are read as height and width.
        input_ids (torch.Tensor): Text token IDs.
        token_type_ids (torch.Tensor): Token type IDs.
        attention_mask (torch.Tensor): Attention mask.
        norm_bboxes (bool, optional): Whether to return normalized boxes. When False, boxes are
            multiplied by ``[w, h, w, h]`` of `pixel_values`. Defaults to False.
        text_threshold (float, optional): Threshold for text token scores. Defaults to 0.25.
        box_threshold (float, optional): Threshold for box confidence scores. Defaults to 0.25.

    Returns:
        GenericOutputs: Three lists with one entry per image:
            `bboxes` (kept boxes), `scores` (their confidence, the maximum token probability of
            each query) and `classes` (long tensor with one row per kept box, holding the input
            token id wherever that token's probability exceeds `text_threshold` and 0 elsewhere).
    """
    h, w = pixel_values.shape[-2:]
    outputs = self.model(
        pixel_values=pixel_values,
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    )

    outputs_class, outputs_coord = self._decode_outputs(
        hidden_states=outputs.intermediate_hidden_states,
        enc_text_hidden_state=outputs.encoder_last_hidden_state_text,
        init_reference_points=outputs.init_reference_points,
        inter_references_points=outputs.intermediate_reference_points,
        attention_mask=attention_mask,
    )

    # Only the predictions of the last decoder level are used for inference.
    logits = outputs_class[-1]
    pred_boxes = outputs_coord[-1]

    probs = torch.sigmoid(logits)
    scores = probs.max(dim=-1).values

    # Token ids and token probabilities may differ in length; compare only
    # the part both of them cover.
    max_len = min(input_ids.shape[-1], logits.shape[-1])

    # Built once; moved to the dtype/device of the boxes on first use.
    scale = None if norm_bboxes else torch.tensor([w, h, w, h])

    kept_bboxes, kept_scores, kept_classes = [], [], []
    for box, score, prob, ids in zip(pred_boxes, scores, probs, input_ids):
        box = xywh2xyxy(box)
        if scale is not None:
            scale = scale.to(box)  # no-op after the first image
            box = box * scale

        # One mask per image, shared by boxes, scores and token probabilities.
        keep = score > box_threshold
        token_hits = prob[keep][:, :max_len] > text_threshold

        kept_bboxes.append(box[keep])
        kept_scores.append(score[keep])
        # Integer masking: keep the token id where the token fired, 0 otherwise.
        kept_classes.append((token_hits * ids[:max_len]).long())

    return GenericOutputs(
        bboxes=kept_bboxes,
        scores=kept_scores,
        classes=kept_classes,
    )