@@ -0,0 +1,399 @@
1 |
Attribution-NonCommercial 4.0 International
2 |
3 |
4 |
5 |
Creative Commons Corporation ("Creative Commons") is not a law firm and
6 |
does not provide legal services or legal advice. Distribution of
7 |
Creative Commons public licenses does not create a lawyer-client or
8 |
other relationship. Creative Commons makes its licenses and related
9 |
information available on an "as-is" basis. Creative Commons gives no
10 |
warranties regarding its licenses, any material licensed under their
11 |
terms and conditions, or any related information. Creative Commons
12 |
disclaims all liability for damages resulting from their use to the
13 |
fullest extent possible.
14 |
15 |
Using Creative Commons Public Licenses
16 |
17 |
Creative Commons public licenses provide a standard set of terms and
18 |
conditions that creators and other rights holders may use to share
19 |
original works of authorship and other material subject to copyright
20 |
and certain other rights specified in the public license below. The
21 |
following considerations are for informational purposes only, are not
22 |
exhaustive, and do not form part of our licenses.
23 |
24 |
Considerations for licensors: Our public licenses are
25 |
intended for use by those authorized to give the public
26 |
permission to use material in ways otherwise restricted by
27 |
copyright and certain other rights. Our licenses are
28 |
irrevocable. Licensors should read and understand the terms
29 |
and conditions of the license they choose before applying it.
30 |
Licensors should also secure all rights necessary before
31 |
applying our licenses so that the public can reuse the
32 |
material as expected. Licensors should clearly mark any
33 |
material not subject to the license. This includes other CC-
34 |
licensed material, or material used under an exception or
35 |
limitation to copyright. More considerations for licensors:
36 |
37 |
38 |
Considerations for the public: By using one of our public
39 |
licenses, a licensor grants the public permission to use the
40 |
licensed material under specified terms and conditions. If
41 |
the licensor's permission is not necessary for any reason--for
42 |
example, because of any applicable exception or limitation to
43 |
copyright--then that use is not regulated by the license. Our
44 |
licenses grant only permissions under copyright and certain
45 |
other rights that a licensor has authority to grant. Use of
46 |
the licensed material may still be restricted for other
47 |
reasons, including because others have copyright or other
48 |
rights in the material. A licensor may make special requests,
49 |
such as asking that all changes be marked or described.
50 |
Although not required by our licenses, you are encouraged to
51 |
respect those requests where reasonable. More_considerations
52 |
for the public:
53 |
54 |
55 |
56 |
57 |
Creative Commons Attribution-NonCommercial 4.0 International Public
58 |
59 |
60 |
By exercising the Licensed Rights (defined below), You accept and agree
61 |
to be bound by the terms and conditions of this Creative Commons
62 |
Attribution-NonCommercial 4.0 International Public License ("Public
63 |
License"). To the extent this Public License may be interpreted as a
64 |
contract, You are granted the Licensed Rights in consideration of Your
65 |
acceptance of these terms and conditions, and the Licensor grants You
66 |
such rights in consideration of benefits the Licensor receives from
67 |
making the Licensed Material available under these terms and
68 |
69 |
70 |
Section 1 -- Definitions.
71 |
72 |
a. Adapted Material means material subject to Copyright and Similar
73 |
Rights that is derived from or based upon the Licensed Material
74 |
and in which the Licensed Material is translated, altered,
75 |
arranged, transformed, or otherwise modified in a manner requiring
76 |
permission under the Copyright and Similar Rights held by the
77 |
Licensor. For purposes of this Public License, where the Licensed
78 |
Material is a musical work, performance, or sound recording,
79 |
Adapted Material is always produced where the Licensed Material is
80 |
synched in timed relation with a moving image.
81 |
82 |
b. Adapter's License means the license You apply to Your Copyright
83 |
and Similar Rights in Your contributions to Adapted Material in
84 |
accordance with the terms and conditions of this Public License.
85 |
86 |
c. Copyright and Similar Rights means copyright and/or similar rights
87 |
closely related to copyright including, without limitation,
88 |
performance, broadcast, sound recording, and Sui Generis Database
89 |
Rights, without regard to how the rights are labeled or
90 |
categorized. For purposes of this Public License, the rights
91 |
specified in Section 2(b)(1)-(2) are not Copyright and Similar
92 |
93 |
d. Effective Technological Measures means those measures that, in the
94 |
absence of proper authority, may not be circumvented under laws
95 |
fulfilling obligations under Article 11 of the WIPO Copyright
96 |
Treaty adopted on December 20, 1996, and/or similar international
97 |
98 |
99 |
e. Exceptions and Limitations means fair use, fair dealing, and/or
100 |
any other exception or limitation to Copyright and Similar Rights
101 |
that applies to Your use of the Licensed Material.
102 |
103 |
f. Licensed Material means the artistic or literary work, database,
104 |
or other material to which the Licensor applied this Public
105 |
106 |
107 |
g. Licensed Rights means the rights granted to You subject to the
108 |
terms and conditions of this Public License, which are limited to
109 |
all Copyright and Similar Rights that apply to Your use of the
110 |
Licensed Material and that the Licensor has authority to license.
111 |
112 |
h. Licensor means the individual(s) or entity(ies) granting rights
113 |
under this Public License.
114 |
115 |
i. NonCommercial means not primarily intended for or directed towards
116 |
commercial advantage or monetary compensation. For purposes of
117 |
this Public License, the exchange of the Licensed Material for
118 |
other material subject to Copyright and Similar Rights by digital
119 |
file-sharing or similar means is NonCommercial provided there is
120 |
no payment of monetary compensation in connection with the
121 |
122 |
123 |
j. Share means to provide material to the public by any means or
124 |
process that requires permission under the Licensed Rights, such
125 |
as reproduction, public display, public performance, distribution,
126 |
dissemination, communication, or importation, and to make material
127 |
available to the public including in ways that members of the
128 |
public may access the material from a place and at a time
129 |
individually chosen by them.
130 |
131 |
k. Sui Generis Database Rights means rights other than copyright
132 |
resulting from Directive 96/9/EC of the European Parliament and of
133 |
the Council of 11 March 1996 on the legal protection of databases,
134 |
as amended and/or succeeded, as well as other essentially
135 |
equivalent rights anywhere in the world.
136 |
137 |
l. You means the individual or entity exercising the Licensed Rights
138 |
under this Public License. Your has a corresponding meaning.
139 |
140 |
Section 2 -- Scope.
141 |
142 |
a. License grant.
143 |
144 |
1. Subject to the terms and conditions of this Public License,
145 |
the Licensor hereby grants You a worldwide, royalty-free,
146 |
non-sublicensable, non-exclusive, irrevocable license to
147 |
exercise the Licensed Rights in the Licensed Material to:
148 |
149 |
a. reproduce and Share the Licensed Material, in whole or
150 |
in part, for NonCommercial purposes only; and
151 |
152 |
b. produce, reproduce, and Share Adapted Material for
153 |
NonCommercial purposes only.
154 |
155 |
2. Exceptions and Limitations. For the avoidance of doubt, where
156 |
Exceptions and Limitations apply to Your use, this Public
157 |
License does not apply, and You do not need to comply with
158 |
its terms and conditions.
159 |
160 |
3. Term. The term of this Public License is specified in Section
161 |
162 |
163 |
4. Media and formats; technical modifications allowed. The
164 |
Licensor authorizes You to exercise the Licensed Rights in
165 |
all media and formats whether now known or hereafter created,
166 |
and to make technical modifications necessary to do so. The
167 |
Licensor waives and/or agrees not to assert any right or
168 |
authority to forbid You from making technical modifications
169 |
necessary to exercise the Licensed Rights, including
170 |
technical modifications necessary to circumvent Effective
171 |
Technological Measures. For purposes of this Public License,
172 |
simply making modifications authorized by this Section 2(a)
173 |
(4) never produces Adapted Material.
174 |
175 |
5. Downstream recipients.
176 |
177 |
a. Offer from the Licensor -- Licensed Material. Every
178 |
recipient of the Licensed Material automatically
179 |
receives an offer from the Licensor to exercise the
180 |
Licensed Rights under the terms and conditions of this
181 |
Public License.
182 |
183 |
b. No downstream restrictions. You may not offer or impose
184 |
any additional or different terms or conditions on, or
185 |
apply any Effective Technological Measures to, the
186 |
Licensed Material if doing so restricts exercise of the
187 |
Licensed Rights by any recipient of the Licensed
188 |
189 |
190 |
6. No endorsement. Nothing in this Public License constitutes or
191 |
may be construed as permission to assert or imply that You
192 |
are, or that Your use of the Licensed Material is, connected
193 |
with, or sponsored, endorsed, or granted official status by,
194 |
the Licensor or others designated to receive attribution as
195 |
provided in Section 3(a)(1)(A)(i).
196 |
197 |
b. Other rights.
198 |
199 |
1. Moral rights, such as the right of integrity, are not
200 |
licensed under this Public License, nor are publicity,
201 |
privacy, and/or other similar personality rights; however, to
202 |
the extent possible, the Licensor waives and/or agrees not to
203 |
assert any such rights held by the Licensor to the limited
204 |
extent necessary to allow You to exercise the Licensed
205 |
Rights, but not otherwise.
206 |
207 |
2. Patent and trademark rights are not licensed under this
208 |
Public License.
209 |
210 |
3. To the extent possible, the Licensor waives any right to
211 |
collect royalties from You for the exercise of the Licensed
212 |
Rights, whether directly or through a collecting society
213 |
under any voluntary or waivable statutory or compulsory
214 |
licensing scheme. In all other cases the Licensor expressly
215 |
reserves any right to collect such royalties, including when
216 |
the Licensed Material is used other than for NonCommercial
217 |
218 |
219 |
Section 3 -- License Conditions.
220 |
221 |
Your exercise of the Licensed Rights is expressly made subject to the
222 |
following conditions.
223 |
224 |
a. Attribution.
225 |
226 |
1. If You Share the Licensed Material (including in modified
227 |
form), You must:
228 |
229 |
a. retain the following if it is supplied by the Licensor
230 |
with the Licensed Material:
231 |
232 |
i. identification of the creator(s) of the Licensed
233 |
Material and any others designated to receive
234 |
attribution, in any reasonable manner requested by
235 |
the Licensor (including by pseudonym if
236 |
237 |
238 |
ii. a copyright notice;
239 |
240 |
iii. a notice that refers to this Public License;
241 |
242 |
iv. a notice that refers to the disclaimer of
243 |
244 |
245 |
v. a URI or hyperlink to the Licensed Material to the
246 |
extent reasonably practicable;
247 |
248 |
b. indicate if You modified the Licensed Material and
249 |
retain an indication of any previous modifications; and
250 |
251 |
c. indicate the Licensed Material is licensed under this
252 |
Public License, and include the text of, or the URI or
253 |
hyperlink to, this Public License.
254 |
255 |
2. You may satisfy the conditions in Section 3(a)(1) in any
256 |
reasonable manner based on the medium, means, and context in
257 |
which You Share the Licensed Material. For example, it may be
258 |
reasonable to satisfy the conditions by providing a URI or
259 |
hyperlink to a resource that includes the required
260 |
261 |
262 |
3. If requested by the Licensor, You must remove any of the
263 |
information required by Section 3(a)(1)(A) to the extent
264 |
reasonably practicable.
265 |
266 |
4. If You Share Adapted Material You produce, the Adapter's
267 |
License You apply must not prevent recipients of the Adapted
268 |
Material from complying with this Public License.
269 |
270 |
Section 4 -- Sui Generis Database Rights.
271 |
272 |
Where the Licensed Rights include Sui Generis Database Rights that
273 |
apply to Your use of the Licensed Material:
274 |
275 |
a. for the avoidance of doubt, Section 2(a)(1) grants You the right
276 |
to extract, reuse, reproduce, and Share all or a substantial
277 |
portion of the contents of the database for NonCommercial purposes
278 |
279 |
280 |
b. if You include all or a substantial portion of the database
281 |
contents in a database in which You have Sui Generis Database
282 |
Rights, then the database in which You have Sui Generis Database
283 |
Rights (but not its individual contents) is Adapted Material; and
284 |
285 |
c. You must comply with the conditions in Section 3(a) if You Share
286 |
all or a substantial portion of the contents of the database.
287 |
288 |
For the avoidance of doubt, this Section 4 supplements and does not
289 |
replace Your obligations under this Public License where the Licensed
290 |
Rights include other Copyright and Similar Rights.
291 |
292 |
Section 5 -- Disclaimer of Warranties and Limitation of Liability.
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
c. The disclaimer of warranties and limitation of liability provided
316 |
above shall be interpreted in a manner that, to the extent
317 |
possible, most closely approximates an absolute disclaimer and
318 |
waiver of all liability.
319 |
320 |
Section 6 -- Term and Termination.
321 |
322 |
a. This Public License applies for the term of the Copyright and
323 |
Similar Rights licensed here. However, if You fail to comply with
324 |
this Public License, then Your rights under this Public License
325 |
terminate automatically.
326 |
327 |
b. Where Your right to use the Licensed Material has terminated under
328 |
Section 6(a), it reinstates:
329 |
330 |
1. automatically as of the date the violation is cured, provided
331 |
it is cured within 30 days of Your discovery of the
332 |
violation; or
333 |
334 |
2. upon express reinstatement by the Licensor.
335 |
336 |
For the avoidance of doubt, this Section 6(b) does not affect any
337 |
right the Licensor may have to seek remedies for Your violations
338 |
of this Public License.
339 |
340 |
c. For the avoidance of doubt, the Licensor may also offer the
341 |
Licensed Material under separate terms or conditions or stop
342 |
distributing the Licensed Material at any time; however, doing so
343 |
will not terminate this Public License.
344 |
345 |
d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
346 |
347 |
348 |
Section 7 -- Other Terms and Conditions.
349 |
350 |
a. The Licensor shall not be bound by any additional or different
351 |
terms or conditions communicated by You unless expressly agreed.
352 |
353 |
b. Any arrangements, understandings, or agreements regarding the
354 |
Licensed Material not stated herein are separate from and
355 |
independent of the terms and conditions of this Public License.
356 |
357 |
Section 8 -- Interpretation.
358 |
359 |
a. For the avoidance of doubt, this Public License does not, and
360 |
shall not be interpreted to, reduce, limit, restrict, or impose
361 |
conditions on any use of the Licensed Material that could lawfully
362 |
be made without permission under this Public License.
363 |
364 |
b. To the extent possible, if any provision of this Public License is
365 |
deemed unenforceable, it shall be automatically reformed to the
366 |
minimum extent necessary to make it enforceable. If the provision
367 |
cannot be reformed, it shall be severed from this Public License
368 |
without affecting the enforceability of the remaining terms and
369 |
370 |
371 |
c. No term or condition of this Public License will be waived and no
372 |
failure to comply consented to unless expressly agreed to by the
373 |
374 |
375 |
d. Nothing in this Public License constitutes or may be interpreted
376 |
as a limitation upon, or waiver of, any privileges and immunities
377 |
that apply to the Licensor or You, including from the legal
378 |
processes of any jurisdiction or authority.
379 |
380 |
381 |
382 |
Creative Commons is not a party to its public
383 |
licenses. Notwithstanding, Creative Commons may elect to apply one of
384 |
its public licenses to material it publishes and in those instances
385 |
will be considered the “Licensor.” The text of the Creative Commons
386 |
public licenses is dedicated to the public domain under the CC0 Public
387 |
Domain Dedication. Except for the limited purpose of indicating that
388 |
material is shared under a Creative Commons public license or as
389 |
otherwise permitted by the Creative Commons policies published at
390 |
+, Creative Commons does not authorize the
391 |
use of the trademark "Creative Commons" or any other trademark or logo
392 |
of Creative Commons without its prior written consent including,
393 |
without limitation, in connection with any unauthorized modifications
394 |
to any of its public licenses or any other arrangements,
395 |
understandings, or agreements concerning use of licensed material. For
396 |
the avoidance of doubt, this paragraph does not form part of the
397 |
public licenses.
398 |
399 |
Creative Commons may be contacted at
Check out the configuration reference at
11 |
12 |
13 |
Check out the configuration reference at
14 |
15 |
# [OVSeg] Open-Vocabulary Semantic Segmentation with Mask-adapted CLIP
16 |
17 |
<img src="resources/pytorch-logo-dark.png" width="10%">
18 |
19 |
This is the official PyTorch implementation of our paper: <br>
20 |
**Open-Vocabulary Semantic Segmentation with Mask-adapted CLIP**<br>
21 |
[Feng Liang](, [Bichen Wu](, [Xiaoliang Dai](, [Kunpeng Li](, [Yinan Zhao](, [Hang Zhang](, [Peizhao Zhang](, [Peter Vajda](, [Diana Marculescu](
22 |
23 |
[[arXiv](] [[Project](]
24 |
25 |
<p align="center">
26 |
<img src="resources/ovseg.gif" width="100%">
27 |
28 |
29 |
30 |
## Installation
31 |
32 |
Please see [installation guide](./
33 |
34 |
## Data Preparation
35 |
36 |
Please see [datasets preparation](./datasets/
37 |
38 |
## Getting started
39 |
40 |
Please see [getting started instruction](./
41 |
42 |
43 |
44 |
Shield: [![CC BY-NC 4.0][cc-by-nc-shield]][cc-by-nc]
45 |
46 |
The majority of OVSeg is licensed under a
47 |
[Creative Commons Attribution-NonCommercial 4.0 International License](LICENSE).
48 |
49 |
[![CC BY-NC 4.0][cc-by-nc-image]][cc-by-nc]
50 |
51 |
52 |
53 |
54 |
55 |
However, portions of the project are under separate license terms: CLIP and ZSSEG are licensed under the [MIT license](; MaskFormer is licensed under the [CC-BY-NC](; openclip is licensed under the license at [its repo](
56 |
57 |
58 |
## Citing OVSeg :pray:
59 |
60 |
If you use OVSeg in your research or wish to refer to the baseline results published in the paper, please use the following BibTeX entry.
61 |
62 |
63 |
64 |
title={Open-Vocabulary Semantic Segmentation with Mask-adapted CLIP},
65 |
author={Liang, Feng and Wu, Bichen and Dai, Xiaoliang and Li, Kunpeng and Zhao, Yinan and Zhang, Hang and Zhang, Peizhao and Vajda, Peter and Marculescu, Diana},
66 |
journal={arXiv preprint arXiv:2210.04150},
67 |
68 |
69 |
@@ -0,0 +1,100 @@
1 |
2 |
3 |
4 |
5 |
NAME: "D2SwinTransformer"
6 |
7 |
8 |
DEPTHS: [2, 2, 18, 2]
9 |
NUM_HEADS: [4, 8, 16, 32]
10 |
11 |
APE: False
12 |
13 |
14 |
15 |
WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
16 |
PIXEL_MEAN: [123.675, 116.280, 103.530]
17 |
PIXEL_STD: [58.395, 57.120, 57.375]
18 |
19 |
NAME: "OpenVocabMaskFormerHead"
20 |
IN_FEATURES: ["res2", "res3", "res4", "res5"]
21 |
22 |
NUM_CLASSES: 171 # number of categories in training set
23 |
24 |
25 |
COMMON_STRIDE: 4 # not used, hard-coded
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
MASK_FILL: "mean"
48 |
49 |
MASK_THR: 0.4 # choose the foreground objects
50 |
MASK_MATTING: False # use soft background, default not used
51 |
52 |
MASK_PROMPT_FWD: True # use mask prompt during forward
53 |
REGION_RESIZED: True # resize to the input of clip, e.g., 224
54 |
CLIP_ENSEMBLE: True # use ensemble of two classification branches
55 |
56 |
57 |
TRAIN: ("coco_2017_train_stuff_sem_seg",)
58 |
TEST: ("ade20k_sem_seg_val",)
59 |
60 |
61 |
BASE_LR: 0.00006
62 |
MAX_ITER: 120000
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
CLIP_TYPE: "full_model"
74 |
75 |
76 |
77 |
MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
78 |
79 |
80 |
81 |
82 |
83 |
84 |
TYPE: "absolute"
85 |
SIZE: (640, 640)
86 |
87 |
88 |
SIZE_DIVISIBILITY: 640 # used in dataset mapper
89 |
90 |
91 |
92 |
93 |
94 |
MIN_SIZES: [256, 384, 512, 640, 768, 896]
95 |
MAX_SIZE: 3584
96 |
FLIP: True
97 |
98 |
99 |
100 |
@@ -0,0 +1,99 @@
1 |
2 |
3 |
4 |
5 |
NAME: "D2SwinTransformer"
6 |
7 |
8 |
DEPTHS: [2, 2, 18, 2]
9 |
NUM_HEADS: [4, 8, 16, 32]
10 |
11 |
APE: False
12 |
13 |
14 |
15 |
WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
16 |
PIXEL_MEAN: [123.675, 116.280, 103.530]
17 |
PIXEL_STD: [58.395, 57.120, 57.375]
18 |
19 |
NAME: "OpenVocabMaskFormerHead"
20 |
IN_FEATURES: ["res2", "res3", "res4", "res5"]
21 |
22 |
NUM_CLASSES: 171 # number of categories in training set
23 |
24 |
25 |
COMMON_STRIDE: 4 # not used, hard-coded
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
MASK_FILL: "mean"
48 |
49 |
MASK_THR: 0.35 # choose the foreground objects
50 |
MASK_MATTING: False # use soft background, default not used
51 |
52 |
MASK_PROMPT_FWD: True # use mask prompt during forward
53 |
REGION_RESIZED: True # resize to the input of clip, e.g., 224
54 |
CLIP_ENSEMBLE: True # use ensemble of two classification branches
55 |
56 |
57 |
TRAIN: ("coco_2017_train_stuff_sem_seg",)
58 |
TEST: ("ade20k_sem_seg_val",)
59 |
60 |
61 |
BASE_LR: 0.00006
62 |
MAX_ITER: 120000
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
CLIP_TYPE: "full_model"
73 |
74 |
75 |
76 |
MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
77 |
78 |
79 |
80 |
81 |
82 |
83 |
TYPE: "absolute"
84 |
SIZE: (640, 640)
85 |
86 |
87 |
SIZE_DIVISIBILITY: 640 # used in dataset mapper
88 |
89 |
90 |
91 |
92 |
93 |
MIN_SIZES: [256, 384, 512, 640, 768, 896]
94 |
MAX_SIZE: 3584
95 |
FLIP: True
96 |
97 |
98 |
99 |
@@ -0,0 +1,122 @@
1 |
## Prepare Datasets for OVSeg
2 |
3 |
This doc is a modification/extension of [MaskFormer]( following [Detectron2 format](
4 |
5 |
A dataset can be used by accessing [DatasetCatalog](
6 |
for its data, or [MetadataCatalog]( for its metadata (class names, etc).
7 |
This document explains how to set up the builtin datasets so they can be used by the above APIs.
8 |
[Use Custom Datasets]( gives a deeper dive on how to use `DatasetCatalog` and `MetadataCatalog`,
9 |
and how to add new datasets to them.
10 |
11 |
OVSeg has builtin support for a few datasets.
12 |
The datasets are assumed to exist in a directory specified by the environment variable
13 |
14 |
Under this directory, detectron2 will look for datasets in the structure described below, if needed.
15 |
16 |
17 |
coco/ # COCOStuff-171
18 |
ADEChallengeData2016/ # ADE20K-150
19 |
ADE20K_2021_17_01/ # ADE20K-847
20 |
21 |
22 |
VOC2010/ # PASCALContext-59, PASCALContext-459
23 |
24 |
25 |
You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`.
26 |
If left unset, the default is `./datasets` relative to your current working directory.
27 |
28 |
Unless otherwise noted, our model is trained on COCOStuff-171 and evaluated on ADE20K-150, ADE20K-847, PASCALVOC-20, PASCALContext-59 and PASCALContext-459.
29 |
30 |
| dataset | split | # images | # categories |
31 |
32 |
| COCO Stuff | train2017 | 118K | 171 |
33 |
| ADE20K | val | 2K | 150/847 |
34 |
| Pascal VOC | val | 1.5K | 20 |
35 |
| Pascal Context | val | 5K | 59/459 |
36 |
37 |
38 |
### Expected dataset structure for [COCO Stuff](
39 |
40 |
41 |
train2017/ #
42 |
annotations/ #
43 |
44 |
+ #
45 |
46 |
# below are generated
47 |
48 |
49 |
50 |
51 |
The directory `stuffthingmaps_detectron2` is generated by running `python datasets/`.
52 |
53 |
54 |
55 |
### Expected dataset structure for [ADE20k Scene Parsing (ADE20K-150)](
56 |
57 |
58 |
59 |
60 |
61 |
# below are generated
62 |
63 |
64 |
The directory `annotations_detectron2` is generated by running `python datasets/`.
65 |
66 |
67 |
### Expected dataset structure for [ADE20k-Full (ADE20K-847)](
68 |
69 |
70 |
71 |
72 |
73 |
# below are generated
74 |
75 |
76 |
77 |
The directories `images_detectron2` and `annotations_detectron2` are generated by running `python datasets/`.
78 |
79 |
### Expected dataset structure for [Pascal VOC 2012 (PASCALVOC-20)](
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
SegmentationClassAug/ #
88 |
# below are generated
89 |
90 |
91 |
92 |
93 |
It starts with a tar file `VOCtrainval_11-May-2012.tar`.
94 |
95 |
We use SBD augmented training data as `SegmentationClassAug` following [Deeplab](
96 |
97 |
The directories `images_detectron2` and `annotations_detectron2` are generated by running `python datasets/`.
98 |
99 |
100 |
### Expected dataset structure for [Pascal Context](
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
# below are from
110 |
111 |
112 |
59_labels.txt #
113 |
pascalcontext_val.txt #
114 |
# below are generated
115 |
116 |
117 |
118 |
119 |
It starts with a tar file `VOCtrainval_03-May-2010.tar`. You may want to download the 5K validation set [here](
120 |
121 |
The directory `annotations_detectron2` is generated by running `python datasets/`.
122 |
@@ -0,0 +1,1011 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 |
import os
5 |
import pickle as pkl
6 |
from pathlib import Path
7 |
8 |
import cv2
9 |
import numpy as np
10 |
import tqdm
11 |
from PIL import Image
12 |
13 |
14 |
{"name": "wall", "id": 2978, "trainId": 0},
15 |
{"name": "building, edifice", "id": 312, "trainId": 1},
16 |
{"name": "sky", "id": 2420, "trainId": 2},
17 |
{"name": "tree", "id": 2855, "trainId": 3},
18 |
{"name": "road, route", "id": 2131, "trainId": 4},
19 |
{"name": "floor, flooring", "id": 976, "trainId": 5},
20 |
{"name": "ceiling", "id": 447, "trainId": 6},
21 |
{"name": "bed", "id": 165, "trainId": 7},
22 |
{"name": "sidewalk, pavement", "id": 2377, "trainId": 8},
23 |
{"name": "earth, ground", "id": 838, "trainId": 9},
24 |
{"name": "cabinet", "id": 350, "trainId": 10},
25 |
{"name": "person, individual, someone, somebody, mortal, soul", "id": 1831, "trainId": 11},
26 |
{"name": "grass", "id": 1125, "trainId": 12},
27 |
{"name": "windowpane, window", "id": 3055, "trainId": 13},
28 |
{"name": "car, auto, automobile, machine, motorcar", "id": 401, "trainId": 14},
29 |
{"name": "mountain, mount", "id": 1610, "trainId": 15},
30 |
{"name": "plant, flora, plant life", "id": 1910, "trainId": 16},
31 |
{"name": "table", "id": 2684, "trainId": 17},
32 |
{"name": "chair", "id": 471, "trainId": 18},
33 |
{"name": "curtain, drape, drapery, mantle, pall", "id": 687, "trainId": 19},
34 |
{"name": "door", "id": 774, "trainId": 20},
35 |
{"name": "sofa, couch, lounge", "id": 2473, "trainId": 21},
36 |
{"name": "sea", "id": 2264, "trainId": 22},
37 |
{"name": "painting, picture", "id": 1735, "trainId": 23},
38 |
{"name": "water", "id": 2994, "trainId": 24},
39 |
{"name": "mirror", "id": 1564, "trainId": 25},
40 |
{"name": "house", "id": 1276, "trainId": 26},
41 |
{"name": "rug, carpet, carpeting", "id": 2178, "trainId": 27},
42 |
{"name": "shelf", "id": 2329, "trainId": 28},
43 |
{"name": "armchair", "id": 57, "trainId": 29},
44 |
{"name": "fence, fencing", "id": 907, "trainId": 30},
45 |
{"name": "field", "id": 913, "trainId": 31},
46 |
{"name": "lamp", "id": 1395, "trainId": 32},
47 |
{"name": "rock, stone", "id": 2138, "trainId": 33},
48 |
{"name": "seat", "id": 2272, "trainId": 34},
49 |
{"name": "river", "id": 2128, "trainId": 35},
50 |
{"name": "desk", "id": 724, "trainId": 36},
51 |
{"name": "bathtub, bathing tub, bath, tub", "id": 155, "trainId": 37},
52 |
{"name": "railing, rail", "id": 2053, "trainId": 38},
53 |
{"name": "signboard, sign", "id": 2380, "trainId": 39},
54 |
{"name": "cushion", "id": 689, "trainId": 40},
55 |
{"name": "path", "id": 1788, "trainId": 41},
56 |
{"name": "work surface", "id": 3087, "trainId": 42},
57 |
{"name": "stairs, steps", "id": 2530, "trainId": 43},
58 |
{"name": "column, pillar", "id": 581, "trainId": 44},
59 |
{"name": "sink", "id": 2388, "trainId": 45},
60 |
{"name": "wardrobe, closet, press", "id": 2985, "trainId": 46},
61 |
{"name": "snow", "id": 2454, "trainId": 47},
62 |
{"name": "refrigerator, icebox", "id": 2096, "trainId": 48},
63 |
{"name": "base, pedestal, stand", "id": 137, "trainId": 49},
64 |
{"name": "bridge, span", "id": 294, "trainId": 50},
65 |
{"name": "blind, screen", "id": 212, "trainId": 51},
66 |
{"name": "runway", "id": 2185, "trainId": 52},
67 |
{"name": "cliff, drop, drop-off", "id": 524, "trainId": 53},
68 |
{"name": "sand", "id": 2212, "trainId": 54},
69 |
{"name": "fireplace, hearth, open fireplace", "id": 943, "trainId": 55},
70 |
{"name": "pillow", "id": 1869, "trainId": 56},
71 |
{"name": "screen door, screen", "id": 2251, "trainId": 57},
72 |
{"name": "toilet, can, commode, crapper, pot, potty, stool, throne", "id": 2793, "trainId": 58},
73 |
{"name": "skyscraper", "id": 2423, "trainId": 59},
74 |
{"name": "grandstand, covered stand", "id": 1121, "trainId": 60},
75 |
{"name": "box", "id": 266, "trainId": 61},
76 |
{"name": "pool table, billiard table, snooker table", "id": 1948, "trainId": 62},
77 |
{"name": "palm, palm tree", "id": 1744, "trainId": 63},
78 |
{"name": "double door", "id": 783, "trainId": 64},
79 |
{"name": "coffee table, cocktail table", "id": 571, "trainId": 65},
80 |
{"name": "counter", "id": 627, "trainId": 66},
81 |
{"name": "countertop", "id": 629, "trainId": 67},
82 |
{"name": "chest of drawers, chest, bureau, dresser", "id": 491, "trainId": 68},
83 |
{"name": "kitchen island", "id": 1374, "trainId": 69},
84 |
{"name": "boat", "id": 223, "trainId": 70},
85 |
{"name": "waterfall, falls", "id": 3016, "trainId": 71},
86 |
87 |
"name": "stove, kitchen stove, range, kitchen range, cooking stove",
88 |
"id": 2598,
89 |
"trainId": 72,
90 |
91 |
{"name": "flower", "id": 978, "trainId": 73},
92 |
{"name": "bookcase", "id": 239, "trainId": 74},
93 |
{"name": "controls", "id": 608, "trainId": 75},
94 |
{"name": "book", "id": 236, "trainId": 76},
95 |
{"name": "stairway, staircase", "id": 2531, "trainId": 77},
96 |
{"name": "streetlight, street lamp", "id": 2616, "trainId": 78},
97 |
98 |
"name": "computer, computing machine, computing device, data processor, electronic computer, information processing system",
99 |
"id": 591,
100 |
"trainId": 79,
101 |
102 |
103 |
"name": "bus, autobus, coach, charabanc, double-decker, jitney, motorbus, motorcoach, omnibus, passenger vehicle",
104 |
"id": 327,
105 |
"trainId": 80,
106 |
107 |
{"name": "swivel chair", "id": 2679, "trainId": 81},
108 |
{"name": "light, light source", "id": 1451, "trainId": 82},
109 |
{"name": "bench", "id": 181, "trainId": 83},
110 |
{"name": "case, display case, showcase, vitrine", "id": 420, "trainId": 84},
111 |
{"name": "towel", "id": 2821, "trainId": 85},
112 |
{"name": "fountain", "id": 1023, "trainId": 86},
113 |
{"name": "embankment", "id": 855, "trainId": 87},
114 |
115 |
"name": "television receiver, television, television set, tv, tv set, idiot box, boob tube, telly, goggle box",
116 |
"id": 2733,
117 |
"trainId": 88,
118 |
119 |
{"name": "van", "id": 2928, "trainId": 89},
120 |
{"name": "hill", "id": 1240, "trainId": 90},
121 |
{"name": "awning, sunshade, sunblind", "id": 77, "trainId": 91},
122 |
{"name": "poster, posting, placard, notice, bill, card", "id": 1969, "trainId": 92},
123 |
{"name": "truck, motortruck", "id": 2880, "trainId": 93},
124 |
{"name": "airplane, aeroplane, plane", "id": 14, "trainId": 94},
125 |
{"name": "pole", "id": 1936, "trainId": 95},
126 |
{"name": "tower", "id": 2828, "trainId": 96},
127 |
{"name": "court", "id": 631, "trainId": 97},
128 |
{"name": "ball", "id": 103, "trainId": 98},
129 |
130 |
"name": "aircraft carrier, carrier, flattop, attack aircraft carrier",
131 |
"id": 3144,
132 |
"trainId": 99,
133 |
134 |
{"name": "buffet, counter, sideboard", "id": 308, "trainId": 100},
135 |
{"name": "hovel, hut, hutch, shack, shanty", "id": 1282, "trainId": 101},
136 |
{"name": "apparel, wearing apparel, dress, clothes", "id": 38, "trainId": 102},
137 |
{"name": "minibike, motorbike", "id": 1563, "trainId": 103},
138 |
{"name": "animal, animate being, beast, brute, creature, fauna", "id": 29, "trainId": 104},
139 |
{"name": "chandelier, pendant, pendent", "id": 480, "trainId": 105},
140 |
{"name": "step, stair", "id": 2569, "trainId": 106},
141 |
{"name": "booth, cubicle, stall, kiosk", "id": 247, "trainId": 107},
142 |
{"name": "bicycle, bike, wheel, cycle", "id": 187, "trainId": 108},
143 |
{"name": "doorframe, doorcase", "id": 778, "trainId": 109},
144 |
{"name": "sconce", "id": 2243, "trainId": 110},
145 |
{"name": "pond", "id": 1941, "trainId": 111},
146 |
{"name": "trade name, brand name, brand, marque", "id": 2833, "trainId": 112},
147 |
{"name": "bannister, banister, balustrade, balusters, handrail", "id": 120, "trainId": 113},
148 |
{"name": "bag", "id": 95, "trainId": 114},
149 |
{"name": "traffic light, traffic signal, stoplight", "id": 2836, "trainId": 115},
150 |
{"name": "gazebo", "id": 1087, "trainId": 116},
151 |
{"name": "escalator, moving staircase, moving stairway", "id": 868, "trainId": 117},
152 |
{"name": "land, ground, soil", "id": 1401, "trainId": 118},
153 |
{"name": "board, plank", "id": 220, "trainId": 119},
154 |
{"name": "arcade machine", "id": 47, "trainId": 120},
155 |
{"name": "eiderdown, duvet, continental quilt", "id": 843, "trainId": 121},
156 |
{"name": "bar", "id": 123, "trainId": 122},
157 |
{"name": "stall, stand, sales booth", "id": 2537, "trainId": 123},
158 |
{"name": "playground", "id": 1927, "trainId": 124},
159 |
{"name": "ship", "id": 2337, "trainId": 125},
160 |
{"name": "ottoman, pouf, pouffe, puff, hassock", "id": 1702, "trainId": 126},
161 |
162 |
"name": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin",
163 |
"id": 64,
164 |
"trainId": 127,
165 |
166 |
{"name": "bottle", "id": 249, "trainId": 128},
167 |
{"name": "cradle", "id": 642, "trainId": 129},
168 |
{"name": "pot, flowerpot", "id": 1981, "trainId": 130},
169 |
170 |
"name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter",
171 |
"id": 609,
172 |
"trainId": 131,
173 |
174 |
{"name": "train, railroad train", "id": 2840, "trainId": 132},
175 |
{"name": "stool", "id": 2586, "trainId": 133},
176 |
{"name": "lake", "id": 1393, "trainId": 134},
177 |
{"name": "tank, storage tank", "id": 2704, "trainId": 135},
178 |
{"name": "ice, water ice", "id": 1304, "trainId": 136},
179 |
{"name": "basket, handbasket", "id": 146, "trainId": 137},
180 |
{"name": "manhole", "id": 1494, "trainId": 138},
181 |
{"name": "tent, collapsible shelter", "id": 2739, "trainId": 139},
182 |
{"name": "canopy", "id": 389, "trainId": 140},
183 |
{"name": "microwave, microwave oven", "id": 1551, "trainId": 141},
184 |
{"name": "barrel, cask", "id": 131, "trainId": 142},
185 |
{"name": "dirt track", "id": 738, "trainId": 143},
186 |
{"name": "beam", "id": 161, "trainId": 144},
187 |
{"name": "dishwasher, dish washer, dishwashing machine", "id": 747, "trainId": 145},
188 |
{"name": "plate", "id": 1919, "trainId": 146},
189 |
{"name": "screen, crt screen", "id": 3109, "trainId": 147},
190 |
{"name": "ruins", "id": 2179, "trainId": 148},
191 |
{"name": "washer, automatic washer, washing machine", "id": 2989, "trainId": 149},
192 |
{"name": "blanket, cover", "id": 206, "trainId": 150},
193 |
{"name": "plaything, toy", "id": 1930, "trainId": 151},
194 |
{"name": "food, solid food", "id": 1002, "trainId": 152},
195 |
{"name": "screen, silver screen, projection screen", "id": 2254, "trainId": 153},
196 |
{"name": "oven", "id": 1708, "trainId": 154},
197 |
{"name": "stage", "id": 2526, "trainId": 155},
198 |
{"name": "beacon, lighthouse, beacon light, pharos", "id": 160, "trainId": 156},
199 |
{"name": "umbrella", "id": 2901, "trainId": 157},
200 |
{"name": "sculpture", "id": 2262, "trainId": 158},
201 |
{"name": "aqueduct", "id": 44, "trainId": 159},
202 |
{"name": "container", "id": 597, "trainId": 160},
203 |
{"name": "scaffolding, staging", "id": 2235, "trainId": 161},
204 |
{"name": "hood, exhaust hood", "id": 1260, "trainId": 162},
205 |
{"name": "curb, curbing, kerb", "id": 682, "trainId": 163},
206 |
{"name": "roller coaster", "id": 2151, "trainId": 164},
207 |
{"name": "horse, equus caballus", "id": 3107, "trainId": 165},
208 |
{"name": "catwalk", "id": 432, "trainId": 166},
209 |
{"name": "glass, drinking glass", "id": 1098, "trainId": 167},
210 |
{"name": "vase", "id": 2932, "trainId": 168},
211 |
{"name": "central reservation", "id": 461, "trainId": 169},
212 |
{"name": "carousel", "id": 410, "trainId": 170},
213 |
{"name": "radiator", "id": 2046, "trainId": 171},
214 |
{"name": "closet", "id": 533, "trainId": 172},
215 |
{"name": "machine", "id": 1481, "trainId": 173},
216 |
{"name": "pier, wharf, wharfage, dock", "id": 1858, "trainId": 174},
217 |
{"name": "fan", "id": 894, "trainId": 175},
218 |
{"name": "inflatable bounce game", "id": 1322, "trainId": 176},
219 |
{"name": "pitch", "id": 1891, "trainId": 177},
220 |
{"name": "paper", "id": 1756, "trainId": 178},
221 |
{"name": "arcade, colonnade", "id": 49, "trainId": 179},
222 |
{"name": "hot tub", "id": 1272, "trainId": 180},
223 |
{"name": "helicopter", "id": 1229, "trainId": 181},
224 |
{"name": "tray", "id": 2850, "trainId": 182},
225 |
{"name": "partition, divider", "id": 1784, "trainId": 183},
226 |
{"name": "vineyard", "id": 2962, "trainId": 184},
227 |
{"name": "bowl", "id": 259, "trainId": 185},
228 |
{"name": "bullring", "id": 319, "trainId": 186},
229 |
{"name": "flag", "id": 954, "trainId": 187},
230 |
{"name": "pot", "id": 1974, "trainId": 188},
231 |
{"name": "footbridge, overcrossing, pedestrian bridge", "id": 1013, "trainId": 189},
232 |
{"name": "shower", "id": 2356, "trainId": 190},
233 |
{"name": "bag, traveling bag, travelling bag, grip, suitcase", "id": 97, "trainId": 191},
234 |
{"name": "bulletin board, notice board", "id": 318, "trainId": 192},
235 |
{"name": "confessional booth", "id": 592, "trainId": 193},
236 |
{"name": "trunk, tree trunk, bole", "id": 2885, "trainId": 194},
237 |
{"name": "forest", "id": 1017, "trainId": 195},
238 |
{"name": "elevator door", "id": 851, "trainId": 196},
239 |
{"name": "laptop, laptop computer", "id": 1407, "trainId": 197},
240 |
{"name": "instrument panel", "id": 1332, "trainId": 198},
241 |
{"name": "bucket, pail", "id": 303, "trainId": 199},
242 |
{"name": "tapestry, tapis", "id": 2714, "trainId": 200},
243 |
{"name": "platform", "id": 1924, "trainId": 201},
244 |
{"name": "jacket", "id": 1346, "trainId": 202},
245 |
{"name": "gate", "id": 1081, "trainId": 203},
246 |
{"name": "monitor, monitoring device", "id": 1583, "trainId": 204},
247 |
248 |
"name": "telephone booth, phone booth, call box, telephone box, telephone kiosk",
249 |
"id": 2727,
250 |
"trainId": 205,
251 |
252 |
{"name": "spotlight, spot", "id": 2509, "trainId": 206},
253 |
{"name": "ring", "id": 2123, "trainId": 207},
254 |
{"name": "control panel", "id": 602, "trainId": 208},
255 |
{"name": "blackboard, chalkboard", "id": 202, "trainId": 209},
256 |
{"name": "air conditioner, air conditioning", "id": 10, "trainId": 210},
257 |
{"name": "chest", "id": 490, "trainId": 211},
258 |
{"name": "clock", "id": 530, "trainId": 212},
259 |
{"name": "sand dune", "id": 2213, "trainId": 213},
260 |
{"name": "pipe, pipage, piping", "id": 1884, "trainId": 214},
261 |
{"name": "vault", "id": 2934, "trainId": 215},
262 |
{"name": "table football", "id": 2687, "trainId": 216},
263 |
{"name": "cannon", "id": 387, "trainId": 217},
264 |
{"name": "swimming pool, swimming bath, natatorium", "id": 2668, "trainId": 218},
265 |
{"name": "fluorescent, fluorescent fixture", "id": 982, "trainId": 219},
266 |
{"name": "statue", "id": 2547, "trainId": 220},
267 |
268 |
"name": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system",
269 |
"id": 1474,
270 |
"trainId": 221,
271 |
272 |
{"name": "exhibitor", "id": 877, "trainId": 222},
273 |
{"name": "ladder", "id": 1391, "trainId": 223},
274 |
{"name": "carport", "id": 414, "trainId": 224},
275 |
{"name": "dam", "id": 698, "trainId": 225},
276 |
{"name": "pulpit", "id": 2019, "trainId": 226},
277 |
{"name": "skylight, fanlight", "id": 2422, "trainId": 227},
278 |
{"name": "water tower", "id": 3010, "trainId": 228},
279 |
{"name": "grill, grille, grillwork", "id": 1139, "trainId": 229},
280 |
{"name": "display board", "id": 753, "trainId": 230},
281 |
{"name": "pane, pane of glass, window glass", "id": 1747, "trainId": 231},
282 |
{"name": "rubbish, trash, scrap", "id": 2175, "trainId": 232},
283 |
{"name": "ice rink", "id": 1301, "trainId": 233},
284 |
{"name": "fruit", "id": 1033, "trainId": 234},
285 |
{"name": "patio", "id": 1789, "trainId": 235},
286 |
{"name": "vending machine", "id": 2939, "trainId": 236},
287 |
{"name": "telephone, phone, telephone set", "id": 2730, "trainId": 237},
288 |
{"name": "net", "id": 1652, "trainId": 238},
289 |
290 |
"name": "backpack, back pack, knapsack, packsack, rucksack, haversack",
291 |
"id": 90,
292 |
"trainId": 239,
293 |
294 |
{"name": "jar", "id": 1349, "trainId": 240},
295 |
{"name": "track", "id": 2830, "trainId": 241},
296 |
{"name": "magazine", "id": 1485, "trainId": 242},
297 |
{"name": "shutter", "id": 2370, "trainId": 243},
298 |
{"name": "roof", "id": 2155, "trainId": 244},
299 |
{"name": "banner, streamer", "id": 118, "trainId": 245},
300 |
{"name": "landfill", "id": 1402, "trainId": 246},
301 |
{"name": "post", "id": 1957, "trainId": 247},
302 |
{"name": "altarpiece, reredos", "id": 3130, "trainId": 248},
303 |
{"name": "hat, chapeau, lid", "id": 1197, "trainId": 249},
304 |
{"name": "arch, archway", "id": 52, "trainId": 250},
305 |
{"name": "table game", "id": 2688, "trainId": 251},
306 |
{"name": "bag, handbag, pocketbook, purse", "id": 96, "trainId": 252},
307 |
{"name": "document, written document, papers", "id": 762, "trainId": 253},
308 |
{"name": "dome", "id": 772, "trainId": 254},
309 |
{"name": "pier", "id": 1857, "trainId": 255},
310 |
{"name": "shanties", "id": 2315, "trainId": 256},
311 |
{"name": "forecourt", "id": 1016, "trainId": 257},
312 |
{"name": "crane", "id": 643, "trainId": 258},
313 |
{"name": "dog, domestic dog, canis familiaris", "id": 3105, "trainId": 259},
314 |
{"name": "piano, pianoforte, forte-piano", "id": 1849, "trainId": 260},
315 |
{"name": "drawing", "id": 791, "trainId": 261},
316 |
{"name": "cabin", "id": 349, "trainId": 262},
317 |
318 |
"name": "ad, advertisement, advertizement, advertising, advertizing, advert",
319 |
"id": 6,
320 |
"trainId": 263,
321 |
322 |
{"name": "amphitheater, amphitheatre, coliseum", "id": 3114, "trainId": 264},
323 |
{"name": "monument", "id": 1587, "trainId": 265},
324 |
{"name": "henhouse", "id": 1233, "trainId": 266},
325 |
{"name": "cockpit", "id": 559, "trainId": 267},
326 |
{"name": "heater, warmer", "id": 1223, "trainId": 268},
327 |
{"name": "windmill, aerogenerator, wind generator", "id": 3049, "trainId": 269},
328 |
{"name": "pool", "id": 1943, "trainId": 270},
329 |
{"name": "elevator, lift", "id": 853, "trainId": 271},
330 |
{"name": "decoration, ornament, ornamentation", "id": 709, "trainId": 272},
331 |
{"name": "labyrinth", "id": 1390, "trainId": 273},
332 |
{"name": "text, textual matter", "id": 2748, "trainId": 274},
333 |
{"name": "printer", "id": 2007, "trainId": 275},
334 |
{"name": "mezzanine, first balcony", "id": 1546, "trainId": 276},
335 |
{"name": "mattress", "id": 1513, "trainId": 277},
336 |
{"name": "straw", "id": 2600, "trainId": 278},
337 |
{"name": "stalls", "id": 2538, "trainId": 279},
338 |
{"name": "patio, terrace", "id": 1790, "trainId": 280},
339 |
{"name": "billboard, hoarding", "id": 194, "trainId": 281},
340 |
{"name": "bus stop", "id": 326, "trainId": 282},
341 |
{"name": "trouser, pant", "id": 2877, "trainId": 283},
342 |
{"name": "console table, console", "id": 594, "trainId": 284},
343 |
{"name": "rack", "id": 2036, "trainId": 285},
344 |
{"name": "notebook", "id": 1662, "trainId": 286},
345 |
{"name": "shrine", "id": 2366, "trainId": 287},
346 |
{"name": "pantry", "id": 1754, "trainId": 288},
347 |
{"name": "cart", "id": 418, "trainId": 289},
348 |
{"name": "steam shovel", "id": 2553, "trainId": 290},
349 |
{"name": "porch", "id": 1951, "trainId": 291},
350 |
{"name": "postbox, mailbox, letter box", "id": 1963, "trainId": 292},
351 |
{"name": "figurine, statuette", "id": 918, "trainId": 293},
352 |
{"name": "recycling bin", "id": 2086, "trainId": 294},
353 |
{"name": "folding screen", "id": 997, "trainId": 295},
354 |
{"name": "telescope", "id": 2731, "trainId": 296},
355 |
{"name": "deck chair, beach chair", "id": 704, "trainId": 297},
356 |
{"name": "kennel", "id": 1365, "trainId": 298},
357 |
{"name": "coffee maker", "id": 569, "trainId": 299},
358 |
{"name": "altar, communion table, lord's table", "id": 3108, "trainId": 300},
359 |
{"name": "fish", "id": 948, "trainId": 301},
360 |
{"name": "easel", "id": 839, "trainId": 302},
361 |
{"name": "artificial golf green", "id": 63, "trainId": 303},
362 |
{"name": "iceberg", "id": 1305, "trainId": 304},
363 |
{"name": "candlestick, candle holder", "id": 378, "trainId": 305},
364 |
{"name": "shower stall, shower bath", "id": 2362, "trainId": 306},
365 |
{"name": "television stand", "id": 2734, "trainId": 307},
366 |
367 |
"name": "wall socket, wall plug, electric outlet, electrical outlet, outlet, electric receptacle",
368 |
"id": 2982,
369 |
"trainId": 308,
370 |
371 |
{"name": "skeleton", "id": 2398, "trainId": 309},
372 |
{"name": "grand piano, grand", "id": 1119, "trainId": 310},
373 |
{"name": "candy, confect", "id": 382, "trainId": 311},
374 |
{"name": "grille door", "id": 1141, "trainId": 312},
375 |
{"name": "pedestal, plinth, footstall", "id": 1805, "trainId": 313},
376 |
{"name": "jersey, t-shirt, tee shirt", "id": 3102, "trainId": 314},
377 |
{"name": "shoe", "id": 2341, "trainId": 315},
378 |
{"name": "gravestone, headstone, tombstone", "id": 1131, "trainId": 316},
379 |
{"name": "shanty", "id": 2316, "trainId": 317},
380 |
{"name": "structure", "id": 2626, "trainId": 318},
381 |
{"name": "rocking chair, rocker", "id": 3104, "trainId": 319},
382 |
{"name": "bird", "id": 198, "trainId": 320},
383 |
{"name": "place mat", "id": 1896, "trainId": 321},
384 |
{"name": "tomb", "id": 2800, "trainId": 322},
385 |
{"name": "big top", "id": 190, "trainId": 323},
386 |
{"name": "gas pump, gasoline pump, petrol pump, island dispenser", "id": 3131, "trainId": 324},
387 |
{"name": "lockers", "id": 1463, "trainId": 325},
388 |
{"name": "cage", "id": 357, "trainId": 326},
389 |
{"name": "finger", "id": 929, "trainId": 327},
390 |
{"name": "bleachers", "id": 209, "trainId": 328},
391 |
{"name": "ferris wheel", "id": 912, "trainId": 329},
392 |
{"name": "hairdresser chair", "id": 1164, "trainId": 330},
393 |
{"name": "mat", "id": 1509, "trainId": 331},
394 |
{"name": "stands", "id": 2539, "trainId": 332},
395 |
{"name": "aquarium, fish tank, marine museum", "id": 3116, "trainId": 333},
396 |
{"name": "streetcar, tram, tramcar, trolley, trolley car", "id": 2615, "trainId": 334},
397 |
{"name": "napkin, table napkin, serviette", "id": 1644, "trainId": 335},
398 |
{"name": "dummy", "id": 818, "trainId": 336},
399 |
{"name": "booklet, brochure, folder, leaflet, pamphlet", "id": 242, "trainId": 337},
400 |
{"name": "sand trap", "id": 2217, "trainId": 338},
401 |
{"name": "shop, store", "id": 2347, "trainId": 339},
402 |
{"name": "table cloth", "id": 2686, "trainId": 340},
403 |
{"name": "service station", "id": 2300, "trainId": 341},
404 |
{"name": "coffin", "id": 572, "trainId": 342},
405 |
{"name": "drawer", "id": 789, "trainId": 343},
406 |
{"name": "cages", "id": 358, "trainId": 344},
407 |
{"name": "slot machine, coin machine", "id": 2443, "trainId": 345},
408 |
{"name": "balcony", "id": 101, "trainId": 346},
409 |
{"name": "volleyball court", "id": 2969, "trainId": 347},
410 |
{"name": "table tennis", "id": 2692, "trainId": 348},
411 |
{"name": "control table", "id": 606, "trainId": 349},
412 |
{"name": "shirt", "id": 2339, "trainId": 350},
413 |
{"name": "merchandise, ware, product", "id": 1533, "trainId": 351},
414 |
{"name": "railway", "id": 2060, "trainId": 352},
415 |
{"name": "parterre", "id": 1782, "trainId": 353},
416 |
{"name": "chimney", "id": 495, "trainId": 354},
417 |
{"name": "can, tin, tin can", "id": 371, "trainId": 355},
418 |
{"name": "tanks", "id": 2707, "trainId": 356},
419 |
{"name": "fabric, cloth, material, textile", "id": 889, "trainId": 357},
420 |
{"name": "alga, algae", "id": 3156, "trainId": 358},
421 |
{"name": "system", "id": 2683, "trainId": 359},
422 |
{"name": "map", "id": 1499, "trainId": 360},
423 |
{"name": "greenhouse", "id": 1135, "trainId": 361},
424 |
{"name": "mug", "id": 1619, "trainId": 362},
425 |
{"name": "barbecue", "id": 125, "trainId": 363},
426 |
{"name": "trailer", "id": 2838, "trainId": 364},
427 |
{"name": "toilet tissue, toilet paper, bathroom tissue", "id": 2792, "trainId": 365},
428 |
{"name": "organ", "id": 1695, "trainId": 366},
429 |
{"name": "dishrag, dishcloth", "id": 746, "trainId": 367},
430 |
{"name": "island", "id": 1343, "trainId": 368},
431 |
{"name": "keyboard", "id": 1370, "trainId": 369},
432 |
{"name": "trench", "id": 2858, "trainId": 370},
433 |
{"name": "basket, basketball hoop, hoop", "id": 145, "trainId": 371},
434 |
{"name": "steering wheel, wheel", "id": 2565, "trainId": 372},
435 |
{"name": "pitcher, ewer", "id": 1892, "trainId": 373},
436 |
{"name": "goal", "id": 1103, "trainId": 374},
437 |
{"name": "bread, breadstuff, staff of life", "id": 286, "trainId": 375},
438 |
{"name": "beds", "id": 170, "trainId": 376},
439 |
{"name": "wood", "id": 3073, "trainId": 377},
440 |
{"name": "file cabinet", "id": 922, "trainId": 378},
441 |
{"name": "newspaper, paper", "id": 1655, "trainId": 379},
442 |
{"name": "motorboat", "id": 1602, "trainId": 380},
443 |
{"name": "rope", "id": 2160, "trainId": 381},
444 |
{"name": "guitar", "id": 1151, "trainId": 382},
445 |
{"name": "rubble", "id": 2176, "trainId": 383},
446 |
{"name": "scarf", "id": 2239, "trainId": 384},
447 |
{"name": "barrels", "id": 132, "trainId": 385},
448 |
{"name": "cap", "id": 394, "trainId": 386},
449 |
{"name": "leaves", "id": 1424, "trainId": 387},
450 |
{"name": "control tower", "id": 607, "trainId": 388},
451 |
{"name": "dashboard", "id": 700, "trainId": 389},
452 |
{"name": "bandstand", "id": 116, "trainId": 390},
453 |
{"name": "lectern", "id": 1425, "trainId": 391},
454 |
{"name": "switch, electric switch, electrical switch", "id": 2676, "trainId": 392},
455 |
{"name": "baseboard, mopboard, skirting board", "id": 141, "trainId": 393},
456 |
{"name": "shower room", "id": 2360, "trainId": 394},
457 |
{"name": "smoke", "id": 2449, "trainId": 395},
458 |
{"name": "faucet, spigot", "id": 897, "trainId": 396},
459 |
{"name": "bulldozer", "id": 317, "trainId": 397},
460 |
{"name": "saucepan", "id": 2228, "trainId": 398},
461 |
{"name": "shops", "id": 2351, "trainId": 399},
462 |
{"name": "meter", "id": 1543, "trainId": 400},
463 |
{"name": "crevasse", "id": 656, "trainId": 401},
464 |
{"name": "gear", "id": 1088, "trainId": 402},
465 |
{"name": "candelabrum, candelabra", "id": 373, "trainId": 403},
466 |
{"name": "sofa bed", "id": 2472, "trainId": 404},
467 |
{"name": "tunnel", "id": 2892, "trainId": 405},
468 |
{"name": "pallet", "id": 1740, "trainId": 406},
469 |
{"name": "wire, conducting wire", "id": 3067, "trainId": 407},
470 |
{"name": "kettle, boiler", "id": 1367, "trainId": 408},
471 |
{"name": "bidet", "id": 188, "trainId": 409},
472 |
473 |
"name": "baby buggy, baby carriage, carriage, perambulator, pram, stroller, go-cart, pushchair, pusher",
474 |
"id": 79,
475 |
"trainId": 410,
476 |
477 |
{"name": "music stand", "id": 1633, "trainId": 411},
478 |
{"name": "pipe, tube", "id": 1885, "trainId": 412},
479 |
{"name": "cup", "id": 677, "trainId": 413},
480 |
{"name": "parking meter", "id": 1779, "trainId": 414},
481 |
{"name": "ice hockey rink", "id": 1297, "trainId": 415},
482 |
{"name": "shelter", "id": 2334, "trainId": 416},
483 |
{"name": "weeds", "id": 3027, "trainId": 417},
484 |
{"name": "temple", "id": 2735, "trainId": 418},
485 |
{"name": "patty, cake", "id": 1791, "trainId": 419},
486 |
{"name": "ski slope", "id": 2405, "trainId": 420},
487 |
{"name": "panel", "id": 1748, "trainId": 421},
488 |
{"name": "wallet", "id": 2983, "trainId": 422},
489 |
{"name": "wheel", "id": 3035, "trainId": 423},
490 |
{"name": "towel rack, towel horse", "id": 2824, "trainId": 424},
491 |
{"name": "roundabout", "id": 2168, "trainId": 425},
492 |
{"name": "canister, cannister, tin", "id": 385, "trainId": 426},
493 |
{"name": "rod", "id": 2148, "trainId": 427},
494 |
{"name": "soap dispenser", "id": 2465, "trainId": 428},
495 |
{"name": "bell", "id": 175, "trainId": 429},
496 |
{"name": "canvas", "id": 390, "trainId": 430},
497 |
{"name": "box office, ticket office, ticket booth", "id": 268, "trainId": 431},
498 |
{"name": "teacup", "id": 2722, "trainId": 432},
499 |
{"name": "trellis", "id": 2857, "trainId": 433},
500 |
{"name": "workbench", "id": 3088, "trainId": 434},
501 |
{"name": "valley, vale", "id": 2926, "trainId": 435},
502 |
{"name": "toaster", "id": 2782, "trainId": 436},
503 |
{"name": "knife", "id": 1378, "trainId": 437},
504 |
{"name": "podium", "id": 1934, "trainId": 438},
505 |
{"name": "ramp", "id": 2072, "trainId": 439},
506 |
{"name": "tumble dryer", "id": 2889, "trainId": 440},
507 |
{"name": "fireplug, fire hydrant, plug", "id": 944, "trainId": 441},
508 |
{"name": "gym shoe, sneaker, tennis shoe", "id": 1158, "trainId": 442},
509 |
{"name": "lab bench", "id": 1383, "trainId": 443},
510 |
{"name": "equipment", "id": 867, "trainId": 444},
511 |
{"name": "rocky formation", "id": 2145, "trainId": 445},
512 |
{"name": "plastic", "id": 1915, "trainId": 446},
513 |
{"name": "calendar", "id": 361, "trainId": 447},
514 |
{"name": "caravan", "id": 402, "trainId": 448},
515 |
{"name": "check-in-desk", "id": 482, "trainId": 449},
516 |
{"name": "ticket counter", "id": 2761, "trainId": 450},
517 |
{"name": "brush", "id": 300, "trainId": 451},
518 |
{"name": "mill", "id": 1554, "trainId": 452},
519 |
{"name": "covered bridge", "id": 636, "trainId": 453},
520 |
{"name": "bowling alley", "id": 260, "trainId": 454},
521 |
{"name": "hanger", "id": 1186, "trainId": 455},
522 |
{"name": "excavator", "id": 871, "trainId": 456},
523 |
{"name": "trestle", "id": 2859, "trainId": 457},
524 |
{"name": "revolving door", "id": 2103, "trainId": 458},
525 |
{"name": "blast furnace", "id": 208, "trainId": 459},
526 |
{"name": "scale, weighing machine", "id": 2236, "trainId": 460},
527 |
{"name": "projector", "id": 2012, "trainId": 461},
528 |
{"name": "soap", "id": 2462, "trainId": 462},
529 |
{"name": "locker", "id": 1462, "trainId": 463},
530 |
{"name": "tractor", "id": 2832, "trainId": 464},
531 |
{"name": "stretcher", "id": 2617, "trainId": 465},
532 |
{"name": "frame", "id": 1024, "trainId": 466},
533 |
{"name": "grating", "id": 1129, "trainId": 467},
534 |
{"name": "alembic", "id": 18, "trainId": 468},
535 |
{"name": "candle, taper, wax light", "id": 376, "trainId": 469},
536 |
{"name": "barrier", "id": 134, "trainId": 470},
537 |
{"name": "cardboard", "id": 407, "trainId": 471},
538 |
{"name": "cave", "id": 434, "trainId": 472},
539 |
{"name": "puddle", "id": 2017, "trainId": 473},
540 |
{"name": "tarp", "id": 2717, "trainId": 474},
541 |
{"name": "price tag", "id": 2005, "trainId": 475},
542 |
{"name": "watchtower", "id": 2993, "trainId": 476},
543 |
{"name": "meters", "id": 1545, "trainId": 477},
544 |
545 |
"name": "light bulb, lightbulb, bulb, incandescent lamp, electric light, electric-light bulb",
546 |
"id": 1445,
547 |
"trainId": 478,
548 |
549 |
{"name": "tracks", "id": 2831, "trainId": 479},
550 |
{"name": "hair dryer", "id": 1161, "trainId": 480},
551 |
{"name": "skirt", "id": 2411, "trainId": 481},
552 |
{"name": "viaduct", "id": 2949, "trainId": 482},
553 |
{"name": "paper towel", "id": 1769, "trainId": 483},
554 |
{"name": "coat", "id": 552, "trainId": 484},
555 |
{"name": "sheet", "id": 2327, "trainId": 485},
556 |
{"name": "fire extinguisher, extinguisher, asphyxiator", "id": 939, "trainId": 486},
557 |
{"name": "water wheel", "id": 3013, "trainId": 487},
558 |
{"name": "pottery, clayware", "id": 1986, "trainId": 488},
559 |
{"name": "magazine rack", "id": 1486, "trainId": 489},
560 |
{"name": "teapot", "id": 2723, "trainId": 490},
561 |
{"name": "microphone, mike", "id": 1549, "trainId": 491},
562 |
{"name": "support", "id": 2649, "trainId": 492},
563 |
{"name": "forklift", "id": 1020, "trainId": 493},
564 |
{"name": "canyon", "id": 392, "trainId": 494},
565 |
{"name": "cash register, register", "id": 422, "trainId": 495},
566 |
{"name": "leaf, leafage, foliage", "id": 1419, "trainId": 496},
567 |
{"name": "remote control, remote", "id": 2099, "trainId": 497},
568 |
{"name": "soap dish", "id": 2464, "trainId": 498},
569 |
{"name": "windshield, windscreen", "id": 3058, "trainId": 499},
570 |
{"name": "cat", "id": 430, "trainId": 500},
571 |
{"name": "cue, cue stick, pool cue, pool stick", "id": 675, "trainId": 501},
572 |
{"name": "vent, venthole, vent-hole, blowhole", "id": 2941, "trainId": 502},
573 |
{"name": "videos", "id": 2955, "trainId": 503},
574 |
{"name": "shovel", "id": 2355, "trainId": 504},
575 |
{"name": "eaves", "id": 840, "trainId": 505},
576 |
{"name": "antenna, aerial, transmitting aerial", "id": 32, "trainId": 506},
577 |
{"name": "shipyard", "id": 2338, "trainId": 507},
578 |
{"name": "hen, biddy", "id": 1232, "trainId": 508},
579 |
{"name": "traffic cone", "id": 2834, "trainId": 509},
580 |
{"name": "washing machines", "id": 2991, "trainId": 510},
581 |
{"name": "truck crane", "id": 2879, "trainId": 511},
582 |
{"name": "cds", "id": 444, "trainId": 512},
583 |
{"name": "niche", "id": 1657, "trainId": 513},
584 |
{"name": "scoreboard", "id": 2246, "trainId": 514},
585 |
{"name": "briefcase", "id": 296, "trainId": 515},
586 |
{"name": "boot", "id": 245, "trainId": 516},
587 |
{"name": "sweater, jumper", "id": 2661, "trainId": 517},
588 |
{"name": "hay", "id": 1202, "trainId": 518},
589 |
{"name": "pack", "id": 1714, "trainId": 519},
590 |
{"name": "bottle rack", "id": 251, "trainId": 520},
591 |
{"name": "glacier", "id": 1095, "trainId": 521},
592 |
{"name": "pergola", "id": 1828, "trainId": 522},
593 |
{"name": "building materials", "id": 311, "trainId": 523},
594 |
{"name": "television camera", "id": 2732, "trainId": 524},
595 |
{"name": "first floor", "id": 947, "trainId": 525},
596 |
{"name": "rifle", "id": 2115, "trainId": 526},
597 |
{"name": "tennis table", "id": 2738, "trainId": 527},
598 |
{"name": "stadium", "id": 2525, "trainId": 528},
599 |
{"name": "safety belt", "id": 2194, "trainId": 529},
600 |
{"name": "cover", "id": 634, "trainId": 530},
601 |
{"name": "dish rack", "id": 740, "trainId": 531},
602 |
{"name": "synthesizer", "id": 2682, "trainId": 532},
603 |
{"name": "pumpkin", "id": 2020, "trainId": 533},
604 |
{"name": "gutter", "id": 1156, "trainId": 534},
605 |
{"name": "fruit stand", "id": 1036, "trainId": 535},
606 |
{"name": "ice floe, floe", "id": 1295, "trainId": 536},
607 |
{"name": "handle, grip, handgrip, hold", "id": 1181, "trainId": 537},
608 |
{"name": "wheelchair", "id": 3037, "trainId": 538},
609 |
{"name": "mousepad, mouse mat", "id": 1614, "trainId": 539},
610 |
{"name": "diploma", "id": 736, "trainId": 540},
611 |
{"name": "fairground ride", "id": 893, "trainId": 541},
612 |
{"name": "radio", "id": 2047, "trainId": 542},
613 |
{"name": "hotplate", "id": 1274, "trainId": 543},
614 |
{"name": "junk", "id": 1361, "trainId": 544},
615 |
{"name": "wheelbarrow", "id": 3036, "trainId": 545},
616 |
{"name": "stream", "id": 2606, "trainId": 546},
617 |
{"name": "toll plaza", "id": 2797, "trainId": 547},
618 |
{"name": "punching bag", "id": 2022, "trainId": 548},
619 |
{"name": "trough", "id": 2876, "trainId": 549},
620 |
{"name": "throne", "id": 2758, "trainId": 550},
621 |
{"name": "chair desk", "id": 472, "trainId": 551},
622 |
{"name": "weighbridge", "id": 3028, "trainId": 552},
623 |
{"name": "extractor fan", "id": 882, "trainId": 553},
624 |
{"name": "hanging clothes", "id": 1189, "trainId": 554},
625 |
{"name": "dish, dish aerial, dish antenna, saucer", "id": 743, "trainId": 555},
626 |
{"name": "alarm clock, alarm", "id": 3122, "trainId": 556},
627 |
{"name": "ski lift", "id": 2401, "trainId": 557},
628 |
{"name": "chain", "id": 468, "trainId": 558},
629 |
{"name": "garage", "id": 1061, "trainId": 559},
630 |
{"name": "mechanical shovel", "id": 1523, "trainId": 560},
631 |
{"name": "wine rack", "id": 3059, "trainId": 561},
632 |
{"name": "tramway", "id": 2843, "trainId": 562},
633 |
{"name": "treadmill", "id": 2853, "trainId": 563},
634 |
{"name": "menu", "id": 1529, "trainId": 564},
635 |
{"name": "block", "id": 214, "trainId": 565},
636 |
{"name": "well", "id": 3032, "trainId": 566},
637 |
{"name": "witness stand", "id": 3071, "trainId": 567},
638 |
{"name": "branch", "id": 277, "trainId": 568},
639 |
{"name": "duck", "id": 813, "trainId": 569},
640 |
{"name": "casserole", "id": 426, "trainId": 570},
641 |
{"name": "frying pan", "id": 1039, "trainId": 571},
642 |
{"name": "desk organizer", "id": 727, "trainId": 572},
643 |
{"name": "mast", "id": 1508, "trainId": 573},
644 |
{"name": "spectacles, specs, eyeglasses, glasses", "id": 2490, "trainId": 574},
645 |
{"name": "service elevator", "id": 2299, "trainId": 575},
646 |
{"name": "dollhouse", "id": 768, "trainId": 576},
647 |
{"name": "hammock", "id": 1172, "trainId": 577},
648 |
{"name": "clothes hanging", "id": 537, "trainId": 578},
649 |
{"name": "photocopier", "id": 1847, "trainId": 579},
650 |
{"name": "notepad", "id": 1664, "trainId": 580},
651 |
{"name": "golf cart", "id": 1110, "trainId": 581},
652 |
{"name": "footpath", "id": 1014, "trainId": 582},
653 |
{"name": "cross", "id": 662, "trainId": 583},
654 |
{"name": "baptismal font", "id": 121, "trainId": 584},
655 |
{"name": "boiler", "id": 227, "trainId": 585},
656 |
{"name": "skip", "id": 2410, "trainId": 586},
657 |
{"name": "rotisserie", "id": 2165, "trainId": 587},
658 |
{"name": "tables", "id": 2696, "trainId": 588},
659 |
{"name": "water mill", "id": 3005, "trainId": 589},
660 |
{"name": "helmet", "id": 1231, "trainId": 590},
661 |
{"name": "cover curtain", "id": 635, "trainId": 591},
662 |
{"name": "brick", "id": 292, "trainId": 592},
663 |
{"name": "table runner", "id": 2690, "trainId": 593},
664 |
{"name": "ashtray", "id": 65, "trainId": 594},
665 |
{"name": "street box", "id": 2607, "trainId": 595},
666 |
{"name": "stick", "id": 2574, "trainId": 596},
667 |
{"name": "hangers", "id": 1188, "trainId": 597},
668 |
{"name": "cells", "id": 456, "trainId": 598},
669 |
{"name": "urinal", "id": 2913, "trainId": 599},
670 |
{"name": "centerpiece", "id": 459, "trainId": 600},
671 |
{"name": "portable fridge", "id": 1955, "trainId": 601},
672 |
{"name": "dvds", "id": 827, "trainId": 602},
673 |
{"name": "golf club", "id": 1111, "trainId": 603},
674 |
{"name": "skirting board", "id": 2412, "trainId": 604},
675 |
{"name": "water cooler", "id": 2997, "trainId": 605},
676 |
{"name": "clipboard", "id": 528, "trainId": 606},
677 |
{"name": "camera, photographic camera", "id": 366, "trainId": 607},
678 |
{"name": "pigeonhole", "id": 1863, "trainId": 608},
679 |
{"name": "chips", "id": 500, "trainId": 609},
680 |
{"name": "food processor", "id": 1001, "trainId": 610},
681 |
{"name": "post box", "id": 1958, "trainId": 611},
682 |
{"name": "lid", "id": 1441, "trainId": 612},
683 |
{"name": "drum", "id": 809, "trainId": 613},
684 |
{"name": "blender", "id": 210, "trainId": 614},
685 |
{"name": "cave entrance", "id": 435, "trainId": 615},
686 |
{"name": "dental chair", "id": 718, "trainId": 616},
687 |
{"name": "obelisk", "id": 1674, "trainId": 617},
688 |
{"name": "canoe", "id": 388, "trainId": 618},
689 |
{"name": "mobile", "id": 1572, "trainId": 619},
690 |
{"name": "monitors", "id": 1584, "trainId": 620},
691 |
{"name": "pool ball", "id": 1944, "trainId": 621},
692 |
{"name": "cue rack", "id": 674, "trainId": 622},
693 |
{"name": "baggage carts", "id": 99, "trainId": 623},
694 |
{"name": "shore", "id": 2352, "trainId": 624},
695 |
{"name": "fork", "id": 1019, "trainId": 625},
696 |
{"name": "paper filer", "id": 1763, "trainId": 626},
697 |
{"name": "bicycle rack", "id": 185, "trainId": 627},
698 |
{"name": "coat rack", "id": 554, "trainId": 628},
699 |
{"name": "garland", "id": 1066, "trainId": 629},
700 |
{"name": "sports bag", "id": 2508, "trainId": 630},
701 |
{"name": "fish tank", "id": 951, "trainId": 631},
702 |
{"name": "towel dispenser", "id": 2822, "trainId": 632},
703 |
{"name": "carriage", "id": 415, "trainId": 633},
704 |
{"name": "brochure", "id": 297, "trainId": 634},
705 |
{"name": "plaque", "id": 1914, "trainId": 635},
706 |
{"name": "stringer", "id": 2619, "trainId": 636},
707 |
{"name": "iron", "id": 1338, "trainId": 637},
708 |
{"name": "spoon", "id": 2505, "trainId": 638},
709 |
{"name": "flag pole", "id": 955, "trainId": 639},
710 |
{"name": "toilet brush", "id": 2786, "trainId": 640},
711 |
{"name": "book stand", "id": 238, "trainId": 641},
712 |
{"name": "water faucet, water tap, tap, hydrant", "id": 3000, "trainId": 642},
713 |
{"name": "ticket office", "id": 2763, "trainId": 643},
714 |
{"name": "broom", "id": 299, "trainId": 644},
715 |
{"name": "dvd", "id": 822, "trainId": 645},
716 |
{"name": "ice bucket", "id": 1288, "trainId": 646},
717 |
{"name": "carapace, shell, cuticle, shield", "id": 3101, "trainId": 647},
718 |
{"name": "tureen", "id": 2894, "trainId": 648},
719 |
{"name": "folders", "id": 992, "trainId": 649},
720 |
{"name": "chess", "id": 489, "trainId": 650},
721 |
{"name": "root", "id": 2157, "trainId": 651},
722 |
{"name": "sewing machine", "id": 2309, "trainId": 652},
723 |
{"name": "model", "id": 1576, "trainId": 653},
724 |
{"name": "pen", "id": 1810, "trainId": 654},
725 |
{"name": "violin", "id": 2964, "trainId": 655},
726 |
{"name": "sweatshirt", "id": 2662, "trainId": 656},
727 |
{"name": "recycling materials", "id": 2087, "trainId": 657},
728 |
{"name": "mitten", "id": 1569, "trainId": 658},
729 |
{"name": "chopping board, cutting board", "id": 503, "trainId": 659},
730 |
{"name": "mask", "id": 1505, "trainId": 660},
731 |
{"name": "log", "id": 1468, "trainId": 661},
732 |
{"name": "mouse, computer mouse", "id": 1613, "trainId": 662},
733 |
{"name": "grill", "id": 1138, "trainId": 663},
734 |
{"name": "hole", "id": 1256, "trainId": 664},
735 |
{"name": "target", "id": 2715, "trainId": 665},
736 |
{"name": "trash bag", "id": 2846, "trainId": 666},
737 |
{"name": "chalk", "id": 477, "trainId": 667},
738 |
{"name": "sticks", "id": 2576, "trainId": 668},
739 |
{"name": "balloon", "id": 108, "trainId": 669},
740 |
{"name": "score", "id": 2245, "trainId": 670},
741 |
{"name": "hair spray", "id": 1162, "trainId": 671},
742 |
{"name": "roll", "id": 2149, "trainId": 672},
743 |
{"name": "runner", "id": 2183, "trainId": 673},
744 |
{"name": "engine", "id": 858, "trainId": 674},
745 |
{"name": "inflatable glove", "id": 1324, "trainId": 675},
746 |
{"name": "games", "id": 1055, "trainId": 676},
747 |
{"name": "pallets", "id": 1741, "trainId": 677},
748 |
{"name": "baskets", "id": 149, "trainId": 678},
749 |
{"name": "coop", "id": 615, "trainId": 679},
750 |
{"name": "dvd player", "id": 825, "trainId": 680},
751 |
{"name": "rocking horse", "id": 2143, "trainId": 681},
752 |
{"name": "buckets", "id": 304, "trainId": 682},
753 |
{"name": "bread rolls", "id": 283, "trainId": 683},
754 |
{"name": "shawl", "id": 2322, "trainId": 684},
755 |
{"name": "watering can", "id": 3017, "trainId": 685},
756 |
{"name": "spotlights", "id": 2510, "trainId": 686},
757 |
{"name": "post-it", "id": 1960, "trainId": 687},
758 |
{"name": "bowls", "id": 265, "trainId": 688},
759 |
{"name": "security camera", "id": 2282, "trainId": 689},
760 |
{"name": "runner cloth", "id": 2184, "trainId": 690},
761 |
{"name": "lock", "id": 1461, "trainId": 691},
762 |
{"name": "alarm, warning device, alarm system", "id": 3113, "trainId": 692},
763 |
{"name": "side", "id": 2372, "trainId": 693},
764 |
{"name": "roulette", "id": 2166, "trainId": 694},
765 |
{"name": "bone", "id": 232, "trainId": 695},
766 |
{"name": "cutlery", "id": 693, "trainId": 696},
767 |
{"name": "pool balls", "id": 1945, "trainId": 697},
768 |
{"name": "wheels", "id": 3039, "trainId": 698},
769 |
{"name": "spice rack", "id": 2494, "trainId": 699},
770 |
{"name": "plant pots", "id": 1908, "trainId": 700},
771 |
{"name": "towel ring", "id": 2827, "trainId": 701},
772 |
{"name": "bread box", "id": 280, "trainId": 702},
773 |
{"name": "video", "id": 2950, "trainId": 703},
774 |
{"name": "funfair", "id": 1044, "trainId": 704},
775 |
{"name": "breads", "id": 288, "trainId": 705},
776 |
{"name": "tripod", "id": 2863, "trainId": 706},
777 |
{"name": "ironing board", "id": 1342, "trainId": 707},
778 |
{"name": "skimmer", "id": 2409, "trainId": 708},
779 |
{"name": "hollow", "id": 1258, "trainId": 709},
780 |
{"name": "scratching post", "id": 2249, "trainId": 710},
781 |
{"name": "tricycle", "id": 2862, "trainId": 711},
782 |
{"name": "file box", "id": 920, "trainId": 712},
783 |
{"name": "mountain pass", "id": 1607, "trainId": 713},
784 |
{"name": "tombstones", "id": 2802, "trainId": 714},
785 |
{"name": "cooker", "id": 610, "trainId": 715},
786 |
{"name": "card game, cards", "id": 3129, "trainId": 716},
787 |
{"name": "golf bag", "id": 1108, "trainId": 717},
788 |
{"name": "towel paper", "id": 2823, "trainId": 718},
789 |
{"name": "chaise lounge", "id": 476, "trainId": 719},
790 |
{"name": "sun", "id": 2641, "trainId": 720},
791 |
{"name": "toilet paper holder", "id": 2788, "trainId": 721},
792 |
{"name": "rake", "id": 2070, "trainId": 722},
793 |
{"name": "key", "id": 1368, "trainId": 723},
794 |
{"name": "umbrella stand", "id": 2903, "trainId": 724},
795 |
{"name": "dartboard", "id": 699, "trainId": 725},
796 |
{"name": "transformer", "id": 2844, "trainId": 726},
797 |
{"name": "fireplace utensils", "id": 942, "trainId": 727},
798 |
{"name": "sweatshirts", "id": 2663, "trainId": 728},
799 |
800 |
"name": "cellular telephone, cellular phone, cellphone, cell, mobile phone",
801 |
"id": 457,
802 |
"trainId": 729,
803 |
804 |
{"name": "tallboy", "id": 2701, "trainId": 730},
805 |
{"name": "stapler", "id": 2540, "trainId": 731},
806 |
{"name": "sauna", "id": 2231, "trainId": 732},
807 |
{"name": "test tube", "id": 2746, "trainId": 733},
808 |
{"name": "palette", "id": 1738, "trainId": 734},
809 |
{"name": "shopping carts", "id": 2350, "trainId": 735},
810 |
{"name": "tools", "id": 2808, "trainId": 736},
811 |
{"name": "push button, push, button", "id": 2025, "trainId": 737},
812 |
{"name": "star", "id": 2541, "trainId": 738},
813 |
{"name": "roof rack", "id": 2156, "trainId": 739},
814 |
{"name": "barbed wire", "id": 126, "trainId": 740},
815 |
{"name": "spray", "id": 2512, "trainId": 741},
816 |
{"name": "ear", "id": 831, "trainId": 742},
817 |
{"name": "sponge", "id": 2503, "trainId": 743},
818 |
{"name": "racket", "id": 2039, "trainId": 744},
819 |
{"name": "tins", "id": 2774, "trainId": 745},
820 |
{"name": "eyeglasses", "id": 886, "trainId": 746},
821 |
{"name": "file", "id": 919, "trainId": 747},
822 |
{"name": "scarfs", "id": 2240, "trainId": 748},
823 |
{"name": "sugar bowl", "id": 2636, "trainId": 749},
824 |
{"name": "flip flop", "id": 963, "trainId": 750},
825 |
{"name": "headstones", "id": 1218, "trainId": 751},
826 |
{"name": "laptop bag", "id": 1406, "trainId": 752},
827 |
{"name": "leash", "id": 1420, "trainId": 753},
828 |
{"name": "climbing frame", "id": 526, "trainId": 754},
829 |
{"name": "suit hanger", "id": 2639, "trainId": 755},
830 |
{"name": "floor spotlight", "id": 975, "trainId": 756},
831 |
{"name": "plate rack", "id": 1921, "trainId": 757},
832 |
{"name": "sewer", "id": 2305, "trainId": 758},
833 |
{"name": "hard drive", "id": 1193, "trainId": 759},
834 |
{"name": "sprinkler", "id": 2517, "trainId": 760},
835 |
{"name": "tools box", "id": 2809, "trainId": 761},
836 |
{"name": "necklace", "id": 1647, "trainId": 762},
837 |
{"name": "bulbs", "id": 314, "trainId": 763},
838 |
{"name": "steel industry", "id": 2560, "trainId": 764},
839 |
{"name": "club", "id": 545, "trainId": 765},
840 |
{"name": "jack", "id": 1345, "trainId": 766},
841 |
{"name": "door bars", "id": 775, "trainId": 767},
842 |
843 |
"name": "control panel, instrument panel, control board, board, panel",
844 |
"id": 603,
845 |
"trainId": 768,
846 |
847 |
{"name": "hairbrush", "id": 1163, "trainId": 769},
848 |
{"name": "napkin holder", "id": 1641, "trainId": 770},
849 |
{"name": "office", "id": 1678, "trainId": 771},
850 |
{"name": "smoke detector", "id": 2450, "trainId": 772},
851 |
{"name": "utensils", "id": 2915, "trainId": 773},
852 |
{"name": "apron", "id": 42, "trainId": 774},
853 |
{"name": "scissors", "id": 2242, "trainId": 775},
854 |
{"name": "terminal", "id": 2741, "trainId": 776},
855 |
{"name": "grinder", "id": 1143, "trainId": 777},
856 |
{"name": "entry phone", "id": 862, "trainId": 778},
857 |
{"name": "newspaper stand", "id": 1654, "trainId": 779},
858 |
{"name": "pepper shaker", "id": 1826, "trainId": 780},
859 |
{"name": "onions", "id": 1689, "trainId": 781},
860 |
861 |
"name": "central processing unit, cpu, c p u , central processor, processor, mainframe",
862 |
"id": 3124,
863 |
"trainId": 782,
864 |
865 |
{"name": "tape", "id": 2710, "trainId": 783},
866 |
{"name": "bat", "id": 152, "trainId": 784},
867 |
{"name": "coaster", "id": 549, "trainId": 785},
868 |
{"name": "calculator", "id": 360, "trainId": 786},
869 |
{"name": "potatoes", "id": 1982, "trainId": 787},
870 |
{"name": "luggage rack", "id": 1478, "trainId": 788},
871 |
{"name": "salt", "id": 2203, "trainId": 789},
872 |
{"name": "street number", "id": 2612, "trainId": 790},
873 |
{"name": "viewpoint", "id": 2956, "trainId": 791},
874 |
{"name": "sword", "id": 2681, "trainId": 792},
875 |
{"name": "cd", "id": 437, "trainId": 793},
876 |
{"name": "rowing machine", "id": 2171, "trainId": 794},
877 |
{"name": "plug", "id": 1933, "trainId": 795},
878 |
{"name": "andiron, firedog, dog, dog-iron", "id": 3110, "trainId": 796},
879 |
{"name": "pepper", "id": 1824, "trainId": 797},
880 |
{"name": "tongs", "id": 2803, "trainId": 798},
881 |
{"name": "bonfire", "id": 234, "trainId": 799},
882 |
{"name": "dog dish", "id": 764, "trainId": 800},
883 |
{"name": "belt", "id": 177, "trainId": 801},
884 |
{"name": "dumbbells", "id": 817, "trainId": 802},
885 |
{"name": "videocassette recorder, vcr", "id": 3145, "trainId": 803},
886 |
{"name": "hook", "id": 1262, "trainId": 804},
887 |
{"name": "envelopes", "id": 864, "trainId": 805},
888 |
{"name": "shower faucet", "id": 2359, "trainId": 806},
889 |
{"name": "watch", "id": 2992, "trainId": 807},
890 |
{"name": "padlock", "id": 1725, "trainId": 808},
891 |
{"name": "swimming pool ladder", "id": 2667, "trainId": 809},
892 |
{"name": "spanners", "id": 2484, "trainId": 810},
893 |
{"name": "gravy boat", "id": 1133, "trainId": 811},
894 |
{"name": "notice board", "id": 1667, "trainId": 812},
895 |
{"name": "trash bags", "id": 2847, "trainId": 813},
896 |
{"name": "fire alarm", "id": 932, "trainId": 814},
897 |
{"name": "ladle", "id": 1392, "trainId": 815},
898 |
{"name": "stethoscope", "id": 2573, "trainId": 816},
899 |
{"name": "rocket", "id": 2140, "trainId": 817},
900 |
{"name": "funnel", "id": 1046, "trainId": 818},
901 |
{"name": "bowling pins", "id": 264, "trainId": 819},
902 |
{"name": "valve", "id": 2927, "trainId": 820},
903 |
{"name": "thermometer", "id": 2752, "trainId": 821},
904 |
{"name": "cups", "id": 679, "trainId": 822},
905 |
{"name": "spice jar", "id": 2493, "trainId": 823},
906 |
{"name": "night light", "id": 1658, "trainId": 824},
907 |
{"name": "soaps", "id": 2466, "trainId": 825},
908 |
{"name": "games table", "id": 1057, "trainId": 826},
909 |
{"name": "slotted spoon", "id": 2444, "trainId": 827},
910 |
{"name": "reel", "id": 2093, "trainId": 828},
911 |
{"name": "scourer", "id": 2248, "trainId": 829},
912 |
{"name": "sleeping robe", "id": 2432, "trainId": 830},
913 |
{"name": "desk mat", "id": 726, "trainId": 831},
914 |
{"name": "dumbbell", "id": 816, "trainId": 832},
915 |
{"name": "hammer", "id": 1171, "trainId": 833},
916 |
{"name": "tie", "id": 2766, "trainId": 834},
917 |
{"name": "typewriter", "id": 2900, "trainId": 835},
918 |
{"name": "shaker", "id": 2313, "trainId": 836},
919 |
{"name": "cheese dish", "id": 488, "trainId": 837},
920 |
{"name": "sea star", "id": 2265, "trainId": 838},
921 |
{"name": "racquet", "id": 2043, "trainId": 839},
922 |
{"name": "butane gas cylinder", "id": 332, "trainId": 840},
923 |
{"name": "paper weight", "id": 1771, "trainId": 841},
924 |
{"name": "shaving brush", "id": 2320, "trainId": 842},
925 |
{"name": "sunglasses", "id": 2646, "trainId": 843},
926 |
{"name": "gear shift", "id": 1089, "trainId": 844},
927 |
{"name": "towel rail", "id": 2826, "trainId": 845},
928 |
{"name": "adding machine, totalizer, totaliser", "id": 3148, "trainId": 846},
929 |
930 |
931 |
932 |
def loadAde20K(file):
    """Load the per-pixel class mask that accompanies an ADE20K image.

    The segmentation PNG is looked up next to the image, using the image
    path with ``.jpg`` replaced by ``_seg.png``.  The class id of every
    pixel is decoded from the red and green channels as
    ``(R / 10) * 256 + G``.

    Args:
        file: Path of the ``.jpg`` image, as a string.

    Returns:
        A dict with the keys ``"img_name"`` (the input path),
        ``"segm_name"`` (the derived segmentation path) and
        ``"class_mask"`` (an ``int32`` array holding one class id per pixel).
    """
    fileseg = file.replace(".jpg", "_seg.png")
    # Read the mask inside a context manager so the file handle is released
    # as soon as the pixels have been copied into the array.
    with Image.open(fileseg) as io:
        seg = np.array(io)

    R = seg[:, :, 0]
    G = seg[:, :, 1]
    # ``R / 10`` is a true division; the int32 cast truncates it back to the
    # integer high part of the class id before it is combined with G.
    ObjectClassMasks = (R / 10).astype(np.int32) * 256 + (G.astype(np.int32))

    return {"img_name": file, "segm_name": fileseg, "class_mask": ObjectClassMasks}
942 |
943 |
944 |
if __name__ == "__main__":
    # Convert the ADE20K-full release into the image / annotation layout used
    # by the detectron2 dataset loaders.  Only the validation split is written.
    dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
    index_file = dataset_dir / "ADE20K_2021_17_01" / "index_ade20k.pkl"
    print('Caution: we only generate the validation set!')
    # NOTE(review): the index is a pickle; only run this against a dataset
    # download you trust, since unpickling executes arbitrary code.
    with open(index_file, "rb") as f:
        index_ade20k = pkl.load(f)

    # Map the dataset's raw class ids onto contiguous training ids.
    # NOTE(review): assumes the category table defined earlier in this file is
    # named ADE20K_SEM_SEG_FULL_CATEGORIES - confirm against the file header.
    id_map = {}
    for cat in ADE20K_SEM_SEG_FULL_CATEGORIES:
        id_map[cat["id"]] = cat["trainId"]

    # make output dir
    for name in ["training", "validation"]:
        image_dir = dataset_dir / "ADE20K_2021_17_01" / "images_detectron2" / name
        image_dir.mkdir(parents=True, exist_ok=True)
        annotation_dir = dataset_dir / "ADE20K_2021_17_01" / "annotations_detectron2" / name
        annotation_dir.mkdir(parents=True, exist_ok=True)

    # process image and gt
    for folder_name, file_name in tqdm.tqdm(
        zip(index_ade20k["folder"], index_ade20k["filename"]),
        total=len(index_ade20k["filename"]),
    ):
        split = "validation" if file_name.split("_")[1] == "val" else "training"
        if split == 'training':
            # FIXME: If you want to generate training set, delete this condition
            continue

        info = loadAde20K(str(dataset_dir / folder_name / file_name))

        # resize image and label
        img = np.asarray(Image.open(info["img_name"]))
        lab = np.asarray(info["class_mask"])

        # Shrink so that the shorter side becomes max_size; images whose
        # shorter side is already at most max_size are left untouched.
        h, w = img.shape[0], img.shape[1]
        max_size = 512
        resize = True
        if w >= h > max_size:
            h_new, w_new = max_size, round(w / float(h) * max_size)
        elif h >= w > max_size:
            h_new, w_new = round(h / float(w) * max_size), max_size
        else:
            resize = False

        if resize:
            img = cv2.resize(img, (w_new, h_new), interpolation=cv2.INTER_LINEAR)
            # Nearest-neighbour keeps label values intact (no blended ids).
            lab = cv2.resize(lab, (w_new, h_new), interpolation=cv2.INTER_NEAREST)

        assert img.dtype == np.uint8
        assert lab.dtype == np.int32

        # apply label conversion and save into uint16 images
        # Pixels whose class id is not in id_map keep the fill value 65535.
        output = np.zeros_like(lab, dtype=np.uint16) + 65535
        for obj_id in np.unique(lab):
            if obj_id in id_map:
                output[lab == obj_id] = id_map[obj_id]

        output_img = dataset_dir / "ADE20K_2021_17_01" / "images_detectron2" / split / file_name
        output_lab = (
            dataset_dir
            / "ADE20K_2021_17_01"
            / "annotations_detectron2"
            / split
            / file_name.replace(".jpg", ".tif")
        )
        Image.fromarray(img).save(output_img)

        assert output.dtype == np.uint16
        Image.fromarray(output).save(output_lab)
@@ -0,0 +1,35 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 |
import os
5 |
from pathlib import Path
6 |
7 |
import numpy as np
8 |
import tqdm
9 |
from PIL import Image
10 |
11 |
12 |
def convert(input, output, index=None):
    """Rewrite one annotation image with shifted (and optionally remapped) labels.

    Every label is decreased by one, so that the original value 0 wraps
    around to 255 in ``uint8`` arithmetic.  When ``index`` is given, each
    shifted label is then replaced by its position in ``index``; labels that
    do not occur in ``index`` become 255.

    Args:
        input: Path of the source annotation image (must decode to ``uint8``).
        output: Path the converted image is written to.
        index: Optional sequence of shifted label values to keep; the position
            of a value in the sequence becomes its new label.
    """
    img = np.asarray(Image.open(input))
    assert img.dtype == np.uint8
    img = img - 1  # 0 (ignore) becomes 255. others are shifted by 1
    if index is not None:
        mapping = {i: k for k, i in enumerate(index)}
        img = np.vectorize(lambda x: mapping.get(x, 255))(img).astype(np.uint8)
    Image.fromarray(img).save(output)
22 |
23 |
24 |
if __name__ == "__main__":
    # Convert every validation annotation of ADEChallengeData2016 and write
    # the result, under the same file name, to annotations_detectron2/.
    dataset_dir = (
        Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016"
    )
    print('Caution: we only generate the validation set!')
    for name in ["validation"]:
        annotation_dir = dataset_dir / "annotations" / name
        output_dir = dataset_dir / "annotations_detectron2" / name
        output_dir.mkdir(parents=True, exist_ok=True)
        # Materialise the listing so tqdm knows the total up front.
        for file in tqdm.tqdm(list(annotation_dir.iterdir())):
            output_file = output_dir / file.name
            convert(file, output_file)
@@ -0,0 +1,219 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
# Modified by Feng Liang from
4 |
5 |
6 |
import os
7 |
import os.path as osp
8 |
from pathlib import Path
9 |
import tqdm
10 |
from glob import glob
11 |
12 |
import numpy as np
13 |
from PIL import Image
14 |
15 |
16 |
# Source class ids in 0..181 that have no entry in the table below.
_CLS_IDS_WITHOUT_TRAIN_ID = (11, 25, 28, 29, 44, 65, 67, 68, 70, 82, 90)

# Maps a class id found in the label maps to its training id.  The remaining
# ids of 0..181 are numbered consecutively (0..170) in ascending order, and
# 255 maps to itself.
full_clsID_to_trID = {
    cls_id: train_id
    for train_id, cls_id in enumerate(
        c for c in range(182) if c not in _CLS_IDS_WITHOUT_TRAIN_ID
    )
}
full_clsID_to_trID[255] = 255
190 |
191 |
def convert_to_trainID(
    maskpath, out_mask_dir, is_train, clsID_to_trID=full_clsID_to_trID, suffix=""
):
    """Remap the class ids of one label map to training ids and save it as PNG.

    Pixels whose class id is not a key of ``clsID_to_trID`` are written as
    255.  A mask that consists of 255 only is not written at all.

    Args:
        maskpath: Path of the source label map.
        out_mask_dir: Root directory of the converted masks.
        is_train: Selects the ``train2017`` (True) or ``val2017`` (False)
            sub-directory of ``out_mask_dir``.
        clsID_to_trID: Mapping from source class id to training id.
        suffix: Text appended to the sub-directory name.
    """
    mask = np.array(Image.open(maskpath))
    # Start from "everything is 255" and fill in the ids that have a mapping.
    # Comparisons run against the untouched source mask, so an already
    # remapped pixel can never be remapped a second time.
    mask_copy = np.ones_like(mask, dtype=np.uint8) * 255
    for clsID, trID in clsID_to_trID.items():
        mask_copy[mask == clsID] = trID

    split_dir = ("train2017" if is_train else "val2017") + suffix
    seg_filename = osp.join(out_mask_dir, split_dir, osp.basename(maskpath))

    # Nothing but 255 left: skip the file instead of writing an empty mask.
    if np.all(mask_copy == 255):
        return
    Image.fromarray(mask_copy).save(seg_filename, "PNG")
206 |
207 |
208 |
209 |
if __name__ == "__main__":
    # Convert the train2017 label maps found under coco/stuffthingmaps and
    # store the results under coco/stuffthingmaps_detectron2.
    dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
    print('Caution: we only generate the training set!')
    coco_path = dataset_dir / "coco"
    mask_dir = coco_path / "stuffthingmaps"
    out_mask_dir = coco_path / "stuffthingmaps_detectron2"

    # Make sure the output sub-directory exists before any mask is written.
    for name in ("train2017",):
        os.makedirs(out_mask_dir / name, exist_ok=True)

    train_pattern = osp.join(mask_dir, "train2017", "*.png")
    train_list = glob(train_pattern)
    for file in tqdm.tqdm(train_list):
        convert_to_trainID(file, out_mask_dir, is_train=True)
@@ -0,0 +1,69 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 |
import tqdm
5 |
import os
6 |
import os.path as osp
7 |
from pathlib import Path
8 |
9 |
import numpy as np
10 |
from PIL import Image
11 |
12 |
13 |
def convert_pc59(mask_path, new_mask_path, pc59_dict):
    """Convert one ``.mat`` label file into a 59-class PNG mask.

    Args:
        mask_path: Path of the ``.mat`` file; its ``'LabelMap'`` entry holds
            the per-pixel class ids.
        new_mask_path: Path the PNG mask is written to.
        pc59_dict: Mapping from training id to the class id used in
            ``'LabelMap'``.  Pixels whose class id is not among the values
            are written as 255.
    """
    # Imported locally so the function carries its own dependency.
    import scipy.io

    mat = scipy.io.loadmat(mask_path)
    mask = mat['LabelMap']

    mask_copy = np.ones_like(mask, dtype=np.uint8) * 255
    for trID, clsID in pc59_dict.items():
        mask_copy[mask == clsID] = trID

    min_value = np.amin(mask_copy)
    # Report the offending value in the assertion message itself rather than
    # printing it and raising with a ``None`` message.
    assert min_value >= 0, min_value
    Image.fromarray(mask_copy).save(new_mask_path, "PNG")
24 |
25 |
def convert_pc459(mask_path, new_mask_path):
    """Convert one ``.mat`` label file into a TIFF mask with ids shifted by -1.

    Args:
        mask_path: Path of the ``.mat`` file; its ``'LabelMap'`` entry holds
            the per-pixel class ids.
        new_mask_path: Path the TIFF mask is written to.
    """
    # Imported locally so the function carries its own dependency.
    import scipy.io

    mat = scipy.io.loadmat(mask_path)
    mask = mat['LabelMap']
    mask = mask - 1
    min_value = np.amin(mask)
    # NOTE(review): if 'LabelMap' is stored with an unsigned dtype, a 0 label
    # wraps around instead of becoming negative and this check cannot fire -
    # confirm the dtype of the input files.
    assert min_value >= 0, min_value
    Image.fromarray(mask).save(new_mask_path, "TIFF")
32 |
33 |
34 |
if __name__ == "__main__":
    # Build the 459-class and 59-class validation masks from the ``.mat``
    # label files under VOCdevkit/VOC2010/trainval.
    dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
    print('Caution: we only generate the validation set!')
    pc_path = dataset_dir / "VOCdevkit/VOC2010"

    # Read the three text inputs up front so a missing file is reported
    # before any output directory is created; ``with`` closes each handle.
    with open(pc_path / "pascalcontext_val.txt", "r") as val_list:
        val_lines = val_list.readlines()
    with open(pc_path / "labels.txt", "r") as pc459_labels:
        pc459_lines = pc459_labels.readlines()
    with open(pc_path / "59_labels.txt", "r") as pc59_labels:
        pc59_lines = pc59_labels.readlines()

    # class name -> class id, taken from the "<id>: <name>" lines.
    pc459_dict = {}
    for line in pc459_lines:
        if ':' in line:
            idx, name = line.split(':')
            idx = int(idx.strip())
            name = name.strip()
            pc459_dict[name] = idx

    # line number in 59_labels.txt -> class id of the class named there.
    pc59_dict = {}
    for i, line in enumerate(pc59_lines):
        name = line.split(':')[-1].strip()
        # Compare by value: ``is not ''`` tests object identity, which is not
        # a reliable emptiness check and triggers a SyntaxWarning.
        if name != '':
            pc59_dict[i] = pc459_dict[name]

    pc459_dir = pc_path / "annotations_detectron2" / "pc459_val"
    pc459_dir.mkdir(parents=True, exist_ok=True)
    pc59_dir = pc_path / "annotations_detectron2" / "pc59_val"
    pc59_dir.mkdir(parents=True, exist_ok=True)

    for line in tqdm.tqdm(val_lines):
        fileid = line.strip()
        ori_mask = f'{pc_path}/trainval/{fileid}.mat'
        pc459_dst = f'{pc459_dir}/{fileid}.tif'
        pc59_dst = f'{pc59_dir}/{fileid}.png'
        # Ids without a label file are skipped silently.
        if osp.exists(ori_mask):
            convert_pc459(ori_mask, pc459_dst)
            convert_pc59(ori_mask, pc59_dst, pc59_dict)
@@ -0,0 +1,71 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
# Modified by Feng Liang from
4 |
5 |
import os
6 |
import os.path as osp
7 |
from pathlib import Path
8 |
import tqdm
9 |
10 |
import numpy as np
11 |
from PIL import Image
12 |
13 |
14 |
# Maps a class id found in the label maps to its training id: 0 and 255 both
# become 255, and the ids 1..20 are shifted down by one to 0..19.
clsID_to_trID = {
    0: 255,
    **{cls_id: cls_id - 1 for cls_id in range(1, 21)},
    255: 255,
}
38 |
39 |
def convert_to_trainID(
    maskpath, out_mask_dir, is_train, clsID_to_trID=clsID_to_trID, suffix=""
):
    """Remap the class ids of one label map to training ids and save it as PNG.

    Pixels whose class id is not a key of ``clsID_to_trID`` are written as
    255.  A mask that consists of 255 only is not written at all.

    Args:
        maskpath: Path of the source label map.
        out_mask_dir: Root directory of the converted masks.
        is_train: Selects the ``train`` (True) or ``val`` (False)
            sub-directory of ``out_mask_dir``.
        clsID_to_trID: Mapping from source class id to training id.
        suffix: Text appended to the sub-directory name.
    """
    mask = np.array(Image.open(maskpath))
    # Start from "everything is 255" and fill in the ids that have a mapping.
    # Comparisons run against the untouched source mask, so an already
    # remapped pixel can never be remapped a second time.
    mask_copy = np.ones_like(mask, dtype=np.uint8) * 255
    for clsID, trID in clsID_to_trID.items():
        mask_copy[mask == clsID] = trID

    split_dir = ("train" if is_train else "val") + suffix
    seg_filename = osp.join(out_mask_dir, split_dir, osp.basename(maskpath))

    # Nothing but 255 left: skip the file instead of writing an empty mask.
    if np.all(mask_copy == 255):
        return
    Image.fromarray(mask_copy).save(seg_filename, "PNG")
54 |
55 |
56 |
57 |
if __name__ == "__main__":
    # Convert the validation label maps listed in
    # ImageSets/Segmentation/val.txt of VOCdevkit/VOC2012.
    dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
    print('Caution: we only generate the validation set!')
    voc_path = dataset_dir / "VOCdevkit" / "VOC2012"
    out_mask_dir = voc_path / "annotations_detectron2"
    out_image_dir = voc_path / "images_detectron2"
    for name in ["val"]:
        os.makedirs((out_mask_dir / name), exist_ok=True)
        os.makedirs((out_image_dir / name), exist_ok=True)
        # ``np.str`` no longer exists in current NumPy releases; the builtin
        # ``str`` is the equivalent dtype.  ``ndmin=1`` keeps a one-line split
        # file from collapsing into a scalar that would iterate per character.
        val_ids = np.loadtxt(
            osp.join(voc_path, "ImageSets/Segmentation/val.txt"), dtype=str, ndmin=1
        ).tolist()
        val_list = [
            osp.join(voc_path, "SegmentationClassAug", f + ".png") for f in val_ids
        ]
        for file in tqdm.tqdm(val_list):
            convert_to_trainID(file, out_mask_dir, is_train=False)
Binary file (6.15 kB). View file
@@ -0,0 +1,9 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 |
from . import data
5 |
from . import modeling
6 |
from .config import add_ovseg_config
7 |
8 |
from .test_time_augmentation import SemanticSegmentorWithTTA
9 |
from .ovseg_model import OVSeg, OVSegDEMO
@@ -0,0 +1,133 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 |
from detectron2.config import CfgNode as CN
5 |
6 |
7 |
def add_mask_former_default_config(cfg):
8 |
# data config
9 |
# select the dataset mapper
10 |
cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic"
11 |
# Color augmentation
12 |
13 |
# We retry random cropping until no single category in semantic segmentation GT occupies more
14 |
# than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
15 |
16 |
# Pad image and segmentation GT in dataset mapper.
17 |
18 |
19 |
# solver config
20 |
# test batch size
21 |
22 |
# weight decay on embedding
23 |
24 |
# optimizer
25 |
26 |
27 |
28 |
# mask_former model config
29 |
30 |
31 |
# loss
32 |
33 |
34 |
35 |
36 |
37 |
# transformer config
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
# mask_former inference config
52 |
53 |
54 |
55 |
56 |
57 |
58 |
# Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet)
59 |
# you can use this config to override
60 |
61 |
62 |
# pixel decoder config
63 |
64 |
# adding transformer in pixel decoder
65 |
66 |
# pixel decoder
67 |
68 |
69 |
# swin transformer backbone
70 |
71 |
72 |
73 |
74 |
cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2]
75 |
cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24]
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
cfg.MODEL.SWIN.APE = False
87 |
88 |
cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
89 |
90 |
91 |
def add_our_config(cfg):
92 |
93 |
94 |
95 |
# whether to use dense crf
96 |
cfg.TEST.DENSE_CRF = False
97 |
98 |
99 |
# embedding head
100 |
101 |
102 |
103 |
# clip_adapter
104 |
105 |
106 |
# for predefined
107 |
108 |
# for learnable prompt
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
# for mask prompt
119 |
120 |
121 |
122 |
# wandb
123 |
cfg.WANDB = CN()
124 |
cfg.WANDB.PROJECT = "open_vocab_seg"
125 |
cfg.WANDB.NAME = None
126 |
127 |
128 |
def add_ovseg_config(cfg):
129 |
130 |
Add config for open_vocab_seg.
131 |
132 |
133 |
Binary file (6.15 kB). View file
@@ -0,0 +1,9 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 |
from .dataset_mappers import *
5 |
from . import datasets
6 |
from .build import (
7 |
8 |
9 |
@@ -0,0 +1,202 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 |
import math
5 |
import numbers
6 |
import numpy as np
7 |
from import Augmentation
8 |
from import (
9 |
10 |
11 |
12 |
13 |
from PIL import Image
14 |
from fvcore.transforms.transform import PadTransform
15 |
16 |
17 |
def mask2box(mask: np.ndarray):
    """Return the tight bounding box around the non-zero entries of a mask.

    Args:
        mask: array indexed as ``[row, column]`` (assumed 2D); every non-zero
            entry counts as foreground.

    Returns:
        ``(x, y, w, h)``: first foreground column, first foreground row, and
        the width and height of the smallest box containing all foreground
        entries. ``None`` when the mask has no non-zero entry.
    """
    # Test "any non-zero" per column/row rather than summing: sums of signed
    # or float masks can cancel to zero and hide real foreground.
    foreground = mask != 0
    cols = np.nonzero(foreground.any(axis=0))[0]
    if len(cols) == 0:
        return None
    # A non-empty column set guarantees a non-empty row set.
    rows = np.nonzero(foreground.any(axis=1))[0]
    x1, x2 = cols.min(), cols.max()
    y1, y2 = rows.min(), rows.max()
    return x1, y1, x2 + 1 - x1, y2 + 1 - y1
28 |
29 |
30 |
def expand_box(x, y, w, h, expand_ratio=1.0, max_h=None, max_w=None):
    """Scale an ``(x, y, w, h)`` box about its centre and optionally clamp it.

    Args:
        x, y: top-left corner of the box.
        w, h: width and height of the box.
        expand_ratio: factor applied to both width and height.
        max_h: if given, the top edge is clamped to >= 0 and the bottom edge
            to <= ``max_h - 1``.
        max_w: if given, the left edge is clamped to >= 0 and the right edge
            to <= ``max_w - 1``.

    Returns:
        ``[x, y, w, h]`` of the resulting box, each value passed through
        ``int()``.
    """
    center_x = x + 0.5 * w
    center_y = y + 0.5 * h
    new_w = w * expand_ratio
    new_h = h * expand_ratio

    left = center_x - 0.5 * new_w
    top = center_y - 0.5 * new_h
    right = center_x + 0.5 * new_w
    bottom = center_y + 0.5 * new_h

    if max_h is not None:
        top = max(0, top)
        bottom = min(max_h - 1, bottom)
    if max_w is not None:
        left = max(0, left)
        right = min(max_w - 1, right)

    # Convert corner form back to (x, y, w, h) before truncating to int.
    return [int(value) for value in (left, top, right - left, bottom - top)]
46 |
47 |
48 |
class CropImageWithMask(Augmentation):
    """Crop an image to the (optionally expanded) box around one category's mask."""

    def __init__(self, expand_ratio=1.0, mode="choice"):
        """
        Args:
            expand_ratio: a single number or a sequence of ratios. A single
                number ``r`` is stored as the pair ``(r, r)``.
            mode: "choice" draws one entry of ``expand_ratio`` at random; any
                other value draws uniformly between its first two entries.
                For "range" the ratios must be a strictly increasing pair.
        """
        ratios = expand_ratio
        if isinstance(ratios, numbers.Number):
            ratios = (ratios, ratios)
        self.mode = mode
        self.expand_ratio = ratios
        if self.mode == "range":
            assert len(ratios) == 2 and ratios[0] < ratios[1]

    def get_transform(self, image, sem_seg, category_id):
        """Build the crop around the pixels of ``sem_seg`` equal to ``category_id``.

        The box from ``mask2box`` is expanded by a sampled ratio, clamped to
        the image's first two dimensions, and forced to be at least 1x1.
        """
        size_hw = image.shape[:2]
        x, y, w, h = mask2box(sem_seg == category_id)
        if self.mode == "choice":
            ratio = np.random.choice(self.expand_ratio)
        else:
            low, high = self.expand_ratio[0], self.expand_ratio[1]
            ratio = np.random.uniform(low, high)
        x, y, w, h = expand_box(x, y, w, h, ratio, *size_hw)
        # Clamping can collapse the box; keep at least one pixel per side.
        return CropTransform(x, y, max(w, 1), max(h, 1), size_hw[1], size_hw[0])
69 |
70 |
71 |
class CropImageWithBox(Augmentation):
    """Crop an image to the (optionally expanded) first box of a box list."""

    def __init__(self, expand_ratio=1.0, mode="choice"):
        """
        Args:
            expand_ratio: a single number or a sequence of ratios. A single
                number ``r`` is stored as the pair ``(r, r)``.
            mode: "choice" draws one entry of ``expand_ratio`` at random; any
                other value draws uniformly between its first two entries.
                For "range" the ratios must be a strictly increasing pair.
        """
        ratios = expand_ratio
        if isinstance(ratios, numbers.Number):
            ratios = (ratios, ratios)
        self.mode = mode
        self.expand_ratio = ratios
        if self.mode == "range":
            assert len(ratios) == 2 and ratios[0] < ratios[1]

    def get_transform(self, image, boxes):
        """Build the crop for ``boxes[0]``, read as ``(x1, y1, x2, y2)``.

        Both corners are treated as inclusive (width is ``x2 - x1 + 1``). The
        box is expanded by a sampled ratio, clamped to the image's first two
        dimensions, and forced to be at least 1x1. Other boxes are ignored.
        """
        size_hw = image.shape[:2]
        left, top, right, bottom = boxes[0]
        box_w = right - left + 1
        box_h = bottom - top + 1
        if self.mode == "choice":
            ratio = np.random.choice(self.expand_ratio)
        else:
            low, high = self.expand_ratio[0], self.expand_ratio[1]
            ratio = np.random.uniform(low, high)
        x, y, w, h = expand_box(left, top, box_w, box_h, ratio, *size_hw)
        # Clamping can collapse the box; keep at least one pixel per side.
        return CropTransform(x, y, max(w, 1), max(h, 1), size_hw[1], size_hw[0])
93 |
94 |
95 |
class RandomResizedCrop(Augmentation):
96 |
def __init__(
97 |
98 |
99 |
scale=(0.08, 1.0),
100 |
ratio=(3.0 / 4.0, 4.0 / 3.0),
101 |
102 |
103 |
if isinstance(size, int):
104 |
size = (size, size)
105 |
106 |
assert isinstance(size, (tuple, list)) and len(size) == 2
107 |
108 |
self.size = size
109 |
110 |
self.scale = scale
111 |
self.ratio = ratio
112 |
self.interpolation = interpolation
113 |
114 |
def get_transform(self, image):
115 |
height, width = image.shape[:2]
116 |
area = height * width
117 |
118 |
log_ratio = np.log(np.array(self.ratio))
119 |
is_success = False
120 |
for _ in range(10):
121 |
target_area = area * np.random.uniform(self.scale[0], self.scale[1])
122 |
aspect_ratio = np.exp(np.random.uniform(log_ratio[0], log_ratio[1]))
123 |
124 |
w = int(round(math.sqrt(target_area * aspect_ratio)))
125 |
h = int(round(math.sqrt(target_area / aspect_ratio)))
126 |
127 |
if 0 < w <= width and 0 < h <= height:
128 |
i = np.random.randint(0, width - w + 1)
129 |
j = np.random.randint(0, height - h + 1)
130 |
131 |
is_success = True
132 |
133 |
134 |
if not is_success:
135 |
# Fallback to central crop
136 |
in_ratio = float(width) / float(height)
137 |
if in_ratio < min(self.ratio):
138 |
w = width
139 |
h = int(round(w / min(self.ratio)))
140 |
elif in_ratio > max(self.ratio):
141 |
h = height
142 |
w = int(round(h * max(self.ratio)))
143 |
else: # whole image
144 |
w = width
145 |
h = height
146 |
i = (width - w) // 2
147 |
j = (height - h) // 2
148 |
return TransformList(
149 |
150 |
CropTransform(i, j, w, h, width, height),
151 |
152 |
h, w, self.size[1], self.size[0], interp=self.interpolation
153 |
154 |
155 |
156 |
157 |
158 |
class CenterCrop(Augmentation):
159 |
def __init__(self, size, seg_ignore_label):
160 |
if isinstance(size, numbers.Number):
161 |
size = (int(size), int(size))
162 |
elif isinstance(size, (tuple, list)) and len(size) == 1:
163 |
size = (size[0], size[0])
164 |
self.size = size
165 |
self.seg_ignore_label = seg_ignore_label
166 |
167 |
def get_transform(self, image):
168 |
169 |
image_height, image_width = image.shape[:2]
170 |
crop_height, crop_width = self.size
171 |
172 |
transforms = []
173 |
if crop_width > image_width or crop_height > image_height:
174 |
padding_ltrb = [
175 |
(crop_width - image_width) // 2 if crop_width > image_width else 0,
176 |
(crop_height - image_height) // 2 if crop_height > image_height else 0,
177 |
(crop_width - image_width + 1) // 2 if crop_width > image_width else 0,
178 |
(crop_height - image_height + 1) // 2
179 |
if crop_height > image_height
180 |
else 0,
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
image_width, image_height = (
191 |
image_width + padding_ltrb[0] + padding_ltrb[2],
192 |
image_height + padding_ltrb[1] + padding_ltrb[3],
193 |
194 |
195 |
crop_top = int(round((image_height - crop_height) / 2.0))
196 |
crop_left = int(round((image_width - crop_width) / 2.0))
197 |
198 |
199 |
crop_left, crop_top, crop_width, crop_height, image_width, image_height
200 |
201 |
202 |
return TransformList(transforms)
@@ -0,0 +1,344 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 |
import itertools
5 |
import logging
6 |
import numpy as np
7 |
from collections import Counter
8 |
9 |
from tabulate import tabulate
10 |
from termcolor import colored
11 |
12 |
from detectron2.utils.logger import _log_api_usage, log_first_n
13 |
from import DatasetCatalog, MetadataCatalog
14 |
15 |
from detectron2.config import configurable
16 |
from import (
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
from import DatasetFromList, MapDataset
26 |
from import DatasetMapper
27 |
from import check_metadata_consistency
28 |
from import (
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
This file contains the default logic to build a dataloader for training or testing.
37 |
38 |
39 |
__all__ = [
40 |
41 |
42 |
43 |
44 |
45 |
def print_classification_instances_class_histogram(dataset_dicts, class_names):
46 |
47 |
48 |
dataset_dicts (list[dict]): list of dataset dicts.
49 |
class_names (list[str]): list of class names (zero-indexed).
50 |
51 |
num_classes = len(class_names)
52 |
hist_bins = np.arange(num_classes + 1)
53 |
histogram = np.zeros((num_classes,),
54 |
for entry in dataset_dicts:
55 |
classes = np.asarray([entry["category_id"]],
56 |
if len(classes):
57 |
assert classes.min() >= 0, f"Got an invalid category_id={classes.min()}"
58 |
assert (
59 |
classes.max() < num_classes
60 |
), f"Got an invalid category_id={classes.max()} for a dataset of {num_classes} classes"
61 |
histogram += np.histogram(classes, bins=hist_bins)[0]
62 |
63 |
N_COLS = min(6, len(class_names) * 2)
64 |
65 |
def short_name(x):
66 |
# make long class names shorter. useful for lvis
67 |
if len(x) > 13:
68 |
return x[:11] + ".."
69 |
return x
70 |
71 |
data = list(
72 |
73 |
*[[short_name(class_names[i]), int(v)] for i, v in enumerate(histogram)]
74 |
75 |
76 |
total_num_instances = sum(data[1::2])
77 |
data.extend([None] * (N_COLS - (len(data) % N_COLS)))
78 |
if num_classes > 1:
79 |
data.extend(["total", total_num_instances])
80 |
data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)])
81 |
table = tabulate(
82 |
83 |
headers=["category", "#instances"] * (N_COLS // 2),
84 |
85 |
86 |
87 |
88 |
89 |
90 |
"Distribution of instances among all {} categories:\n".format(num_classes)
91 |
+ colored(table, "cyan"),
92 |
93 |
94 |
95 |
96 |
def wrap_metas(dataset_dict, **kwargs):
97 |
def _assign_attr(data_dict: dict, **kwargs):
98 |
assert not any(
99 |
[key in data_dict for key in kwargs]
100 |
), "Assigned attributes should not exist in the original sample."
101 |
102 |
return data_dict
103 |
104 |
return [_assign_attr(sample, meta=kwargs) for sample in dataset_dict]
105 |
106 |
107 |
def get_detection_dataset_dicts(
108 |
names, filter_empty=True, min_keypoints=0, proposal_files=None
109 |
110 |
111 |
Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation.
112 |
113 |
114 |
names (str or list[str]): a dataset name or a list of dataset names
115 |
filter_empty (bool): whether to filter out images without instance annotations
116 |
min_keypoints (int): filter out images with fewer keypoints than
117 |
`min_keypoints`. Set to 0 to do nothing.
118 |
proposal_files (list[str]): if given, a list of object proposal files
119 |
that match each dataset in `names`.
120 |
121 |
122 |
list[dict]: a list of dicts following the standard dataset dict format.
123 |
124 |
if isinstance(names, str):
125 |
names = [names]
126 |
assert len(names), names
127 |
dataset_dicts = [
128 |
wrap_metas(DatasetCatalog.get(dataset_name), dataset_name=dataset_name)
129 |
for dataset_name in names
130 |
131 |
for dataset_name, dicts in zip(names, dataset_dicts):
132 |
assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)
133 |
134 |
if proposal_files is not None:
135 |
assert len(names) == len(proposal_files)
136 |
# load precomputed proposals from proposal files
137 |
dataset_dicts = [
138 |
load_proposals_into_dataset(dataset_i_dicts, proposal_file)
139 |
for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files)
140 |
141 |
142 |
dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))
143 |
144 |
has_instances = "annotations" in dataset_dicts[0]
145 |
if filter_empty and has_instances:
146 |
dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts)
147 |
if min_keypoints > 0 and has_instances:
148 |
dataset_dicts = filter_images_with_few_keypoints(dataset_dicts, min_keypoints)
149 |
150 |
if has_instances:
151 |
152 |
class_names = MetadataCatalog.get(names[0]).thing_classes
153 |
check_metadata_consistency("thing_classes", names)
154 |
print_instances_class_histogram(dataset_dicts, class_names)
155 |
except AttributeError: # class names are not available for this dataset
156 |
157 |
158 |
assert len(dataset_dicts), "No valid data found in {}.".format(",".join(names))
159 |
return dataset_dicts
160 |
161 |
162 |
def _train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None):
163 |
if dataset is None:
164 |
dataset = get_detection_dataset_dicts(
165 |
166 |
167 |
168 |
169 |
else 0,
170 |
171 |
172 |
else None,
173 |
174 |
_log_api_usage("dataset." + cfg.DATASETS.TRAIN[0])
175 |
176 |
if mapper is None:
177 |
mapper = DatasetMapper(cfg, True)
178 |
179 |
if sampler is None:
180 |
181 |
logger = logging.getLogger(__name__)
182 |
+"Using training sampler {}".format(sampler_name))
183 |
if sampler_name == "TrainingSampler":
184 |
sampler = TrainingSampler(len(dataset))
185 |
elif sampler_name == "RepeatFactorTrainingSampler":
186 |
repeat_factors = (
187 |
188 |
189 |
190 |
191 |
sampler = RepeatFactorTrainingSampler(repeat_factors)
192 |
elif sampler_name == "RandomSubsetTrainingSampler":
193 |
sampler = RandomSubsetTrainingSampler(
194 |
195 |
196 |
197 |
raise ValueError("Unknown training sampler: {}".format(sampler_name))
198 |
199 |
return {
200 |
"dataset": dataset,
201 |
"sampler": sampler,
202 |
"mapper": mapper,
203 |
"total_batch_size": cfg.SOLVER.IMS_PER_BATCH,
204 |
"aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING,
205 |
"num_workers": cfg.DATALOADER.NUM_WORKERS,
206 |
207 |
208 |
209 |
# TODO can allow dataset as an iterable or IterableDataset to make this function more general
210 |
211 |
def build_detection_train_loader(
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
Build a dataloader for object detection with some default features.
222 |
This interface is experimental.
223 |
224 |
225 |
dataset (list or a list of dataset dicts,
226 |
or a map-style pytorch dataset. They can be obtained by using
227 |
:func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
228 |
mapper (callable): a callable which takes a sample (dict) from dataset and
229 |
returns the format to be consumed by the model.
230 |
When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``.
231 |
sampler ( or None): a sampler that produces
232 |
indices to be applied on ``dataset``. Default to :class:`TrainingSampler`,
233 |
which coordinates an infinite random shuffle sequence across all workers.
234 |
total_batch_size (int): total batch size across all workers. Batching
235 |
simply puts data into a list.
236 |
aspect_ratio_grouping (bool): whether to group images with similar
237 |
aspect ratio for efficiency. When enabled, it requires each
238 |
element in dataset be a dict with keys "width" and "height".
239 |
num_workers (int): number of parallel data loading workers
240 |
241 |
242 |
243 |
a dataloader. Each output from it is a ``list[mapped_element]`` of length
244 |
``total_batch_size / num_workers``, where ``mapped_element`` is produced
245 |
by the ``mapper``.
246 |
247 |
if isinstance(dataset, list):
248 |
dataset = DatasetFromList(dataset, copy=False)
249 |
if mapper is not None:
250 |
dataset = MapDataset(dataset, mapper)
251 |
if sampler is None:
252 |
sampler = TrainingSampler(len(dataset))
253 |
assert isinstance(sampler,
254 |
return build_batch_data_loader(
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
def _test_loader_from_config(cfg, dataset_name, mapper=None):
264 |
265 |
Uses the given `dataset_name` argument (instead of the names in cfg), because the
266 |
standard practice is to evaluate each test set individually (not combining them).
267 |
268 |
if isinstance(dataset_name, str):
269 |
dataset_name = [dataset_name]
270 |
271 |
dataset = get_detection_dataset_dicts(
272 |
273 |
274 |
275 |
276 |
for x in dataset_name
277 |
278 |
279 |
else None,
280 |
281 |
if mapper is None:
282 |
mapper = DatasetMapper(cfg, False)
283 |
return {
284 |
"dataset": dataset,
285 |
"mapper": mapper,
286 |
"num_workers": 0,
287 |
"samples_per_gpu": cfg.SOLVER.TEST_IMS_PER_BATCH,
288 |
289 |
290 |
291 |
292 |
def build_detection_test_loader(
293 |
dataset, *, mapper, sampler=None, num_workers=0, samples_per_gpu=1
294 |
295 |
296 |
Similar to `build_detection_train_loader`, but uses a batch size of 1,
297 |
and :class:`InferenceSampler`. This sampler coordinates all workers to
298 |
produce the exact set of all samples.
299 |
This interface is experimental.
300 |
301 |
302 |
dataset (list or a list of dataset dicts,
303 |
or a map-style pytorch dataset. They can be obtained by using
304 |
:func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
305 |
mapper (callable): a callable which takes a sample (dict) from dataset
306 |
and returns the format to be consumed by the model.
307 |
When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``.
308 |
sampler ( or None): a sampler that produces
309 |
indices to be applied on ``dataset``. Default to :class:`InferenceSampler`,
310 |
which splits the dataset across all workers.
311 |
num_workers (int): number of parallel data loading workers
312 |
313 |
314 |
DataLoader: a torch DataLoader, that loads the given detection
315 |
dataset, with test-time transformation and batching.
316 |
317 |
318 |
319 |
data_loader = build_detection_test_loader(
320 |
321 |
322 |
323 |
# or, instantiate with a CfgNode:
324 |
data_loader = build_detection_test_loader(cfg, "my_test")
325 |
326 |
if isinstance(dataset, list):
327 |
dataset = DatasetFromList(dataset, copy=False)
328 |
if mapper is not None:
329 |
dataset = MapDataset(dataset, mapper)
330 |
if sampler is None:
331 |
sampler = InferenceSampler(len(dataset))
332 |
# Always use 1 image per worker during inference since this is the
333 |
# standard when reporting inference time in papers.
334 |
batch_sampler =
335 |
sampler, samples_per_gpu, drop_last=False
336 |
337 |
data_loader =
338 |
339 |
340 |
341 |
342 |
343 |
return data_loader
344 |
@@ -0,0 +1,4 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 |
from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper
@@ -0,0 +1,208 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 |
import copy
5 |
import logging
6 |
7 |
import numpy as np
8 |
import torch
9 |
from torch.nn import functional as F
10 |
11 |
from detectron2.config import configurable
12 |
from import MetadataCatalog
13 |
from import detection_utils as utils
14 |
from import transforms as T
15 |
from detectron2.projects.point_rend import ColorAugSSDTransform
16 |
from detectron2.structures import BitMasks, Instances
17 |
18 |
__all__ = ["MaskFormerSemanticDatasetMapper"]
19 |
20 |
21 |
class MaskFormerSemanticDatasetMapper:
22 |
23 |
A callable which takes a dataset dict in Detectron2 Dataset format,
24 |
and map it into a format used by MaskFormer for semantic segmentation.
25 |
26 |
The callable currently does the following:
27 |
28 |
1. Read the image from "file_name"
29 |
2. Applies geometric transforms to the image and annotation
30 |
3. Find and applies suitable cropping to the image and annotation
31 |
4. Prepare image and annotation to Tensors
32 |
33 |
34 |
35 |
def __init__(
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
NOTE: this interface is experimental.
46 |
47 |
is_train: for training or inference
48 |
augmentations: a list of augmentations or deterministic transforms to apply
49 |
image_format: an image format supported by :func:`detection_utils.read_image`.
50 |
ignore_label: the label that is ignored to evaluation
51 |
size_divisibility: pad image size to be divisible by this value
52 |
53 |
self.is_train = is_train
54 |
self.tfm_gens = augmentations
55 |
self.img_format = image_format
56 |
self.ignore_label = ignore_label
57 |
self.size_divisibility = size_divisibility
58 |
59 |
logger = logging.getLogger(__name__)
60 |
mode = "training" if is_train else "inference"
61 |
62 |
f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}"
63 |
64 |
65 |
66 |
def from_config(cls, cfg, is_train=True):
67 |
# Build augmentation
68 |
if is_train:
69 |
augs = [
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
# Assume always applies to the training set.
90 |
dataset_names = cfg.DATASETS.TRAIN
91 |
92 |
min_size = cfg.INPUT.MIN_SIZE_TEST
93 |
max_size = cfg.INPUT.MAX_SIZE_TEST
94 |
sample_style = "choice"
95 |
augs = [T.ResizeShortestEdge(min_size, max_size, sample_style)]
96 |
dataset_names = cfg.DATASETS.TEST
97 |
meta = MetadataCatalog.get(dataset_names[0])
98 |
ignore_label = meta.ignore_label
99 |
100 |
ret = {
101 |
"is_train": is_train,
102 |
"augmentations": augs,
103 |
"image_format": cfg.INPUT.FORMAT,
104 |
"ignore_label": ignore_label,
105 |
"size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY if is_train else -1,
106 |
107 |
return ret
108 |
109 |
def __call__(self, dataset_dict):
110 |
111 |
112 |
dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
113 |
114 |
115 |
dict: a format that builtin models in detectron2 accept
116 |
117 |
# assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!"
118 |
119 |
dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
120 |
image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
121 |
utils.check_image_size(dataset_dict, image)
122 |
123 |
if "sem_seg_file_name" in dataset_dict:
124 |
# PyTorch transformation not implemented for uint16, so converting it to double first
125 |
sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype(
126 |
127 |
128 |
129 |
sem_seg_gt = None
130 |
131 |
if sem_seg_gt is None:
132 |
raise ValueError(
133 |
"Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format(
134 |
135 |
136 |
137 |
138 |
aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
139 |
aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
140 |
image = aug_input.image
141 |
sem_seg_gt = aug_input.sem_seg
142 |
143 |
# Pad image and segmentation label here!
144 |
image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
145 |
if sem_seg_gt is not None:
146 |
sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
147 |
148 |
if self.size_divisibility > 0:
149 |
image_size = (image.shape[-2], image.shape[-1])
150 |
padding_size = [
151 |
152 |
self.size_divisibility - image_size[1],
153 |
154 |
self.size_divisibility - image_size[0],
155 |
156 |
image = F.pad(image, padding_size, value=128).contiguous()
157 |
if sem_seg_gt is not None:
158 |
sem_seg_gt = F.pad(
159 |
sem_seg_gt, padding_size, value=self.ignore_label
160 |
161 |
162 |
image_shape = (image.shape[-2], image.shape[-1]) # h, w
163 |
164 |
# Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
165 |
# but not efficient on large generic data structures due to the use of pickle & mp.Queue.
166 |
# Therefore it's important to use torch.Tensor.
167 |
dataset_dict["image"] = image
168 |
169 |
if sem_seg_gt is not None:
170 |
dataset_dict["sem_seg"] = sem_seg_gt.long()
171 |
172 |
if "annotations" in dataset_dict:
173 |
raise ValueError(
174 |
"Semantic segmentation dataset should not have 'annotations'."
175 |
176 |
177 |
# Prepare per-category binary masks
178 |
if sem_seg_gt is not None:
179 |
sem_seg_gt = sem_seg_gt.numpy()
180 |
instances = Instances(image_shape)
181 |
classes = np.unique(sem_seg_gt)
182 |
# remove ignored region
183 |
classes = classes[classes != self.ignore_label]
184 |
instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
185 |
186 |
masks = []
187 |
for class_id in classes:
188 |
masks.append(sem_seg_gt == class_id)
189 |
190 |
if len(masks) == 0:
191 |
# Some image does not have annotation (all ignored)
192 |
instances.gt_masks = torch.zeros(
193 |
(0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1])
194 |
195 |
196 |
masks = BitMasks(
197 |
198 |
199 |
200 |
for x in masks
201 |
202 |
203 |
204 |
instances.gt_masks = masks.tensor
205 |
206 |
dataset_dict["instances"] = instances
207 |
208 |
return dataset_dict
@@ -0,0 +1,5 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
from . import register_coco_stuff, register_voc_seg
3 |
from . import register_cc3m
4 |
from . import register_ade20k_full
5 |
from . import register_pascal_context
@@ -0,0 +1,459 @@
1 |
# Copyright (c) Meta Platforms, Inc. and affiliates.
2 |
import ast
3 |
import json
4 |
import logging
5 |
import math
6 |
import os
7 |
import random
8 |
import sys
9 |
import time
10 |
from dataclasses import dataclass
11 |
from multiprocessing import Value
12 |
13 |
import braceexpand
14 |
import numpy as np
15 |
import pandas as pd
16 |
import torch
17 |
import torchvision.datasets as datasets
18 |
import webdataset as wds
19 |
from PIL import Image
20 |
from import Dataset, DataLoader, SubsetRandomSampler, IterableDataset, get_worker_info
21 |
from import DistributedSampler
22 |
from webdataset.filters import _shuffle
23 |
from webdataset.tariterators import base_plus_ext, url_opener, tar_file_expander, valid_sample
24 |
25 |
26 |
import horovod.torch as hvd
27 |
except ImportError:
28 |
hvd = None
29 |
30 |
from clip import tokenize
31 |
32 |
33 |
class CsvDataset(Dataset):
34 |
def __init__(self, input_filename, transforms, img_key, caption_key, sep="\t"):
35 |
logging.debug(f'Loading csv data from {input_filename}.')
36 |
df = pd.read_csv(input_filename, sep=sep)
37 |
38 |
self.images = df[img_key].tolist()
39 |
self.captions = df[caption_key].tolist()
40 |
self.transforms = transforms
41 |
logging.debug('Done loading data.')
42 |
43 |
def __len__(self):
44 |
return len(self.captions)
45 |
46 |
def __getitem__(self, idx):
47 |
images = self.transforms([idx])))
48 |
texts = tokenize([str(self.captions[idx])])[0]
49 |
return images, texts
50 |
51 |
52 |
class SharedEpoch:
    """Epoch counter held in a ``multiprocessing.Value`` of C type ``'i'``."""

    def __init__(self, epoch: int = 0):
        """Create the counter with ``epoch`` as its initial value."""
        counter = Value('i', epoch)
        self.shared_epoch = counter

    def set_value(self, epoch):
        """Overwrite the stored epoch."""
        self.shared_epoch.value = epoch

    def get_value(self):
        """Return the stored epoch."""
        current = self.shared_epoch.value
        return current
61 |
62 |
63 |
64 |
class DataInfo:
65 |
dataloader: DataLoader
66 |
sampler: DistributedSampler = None
67 |
shared_epoch: SharedEpoch = None
68 |
69 |
def set_epoch(self, epoch):
70 |
if self.shared_epoch is not None:
71 |
72 |
if self.sampler is not None and isinstance(self.sampler, DistributedSampler):
73 |
74 |
75 |
76 |
def preprocess_txt(text):
    """Tokenize one caption with ``clip.tokenize``.

    ``text`` is converted with ``str()`` and tokenized as a batch of one; the
    single entry of the tokenizer output is returned.
    """
    batch = tokenize([str(text)])
    return batch[0]
78 |
79 |
80 |
def get_dataset_size(shards):
    """Return ``(total_size, num_shards)`` for a brace-notation shard spec.

    The sample count is looked up in the directory of ``shards``:

    1. ``sizes.json``: a mapping from shard basename to sample count; the
       counts of all expanded shards are summed (a missing shard raises
       ``KeyError``).
    2. ``__len__``: a file holding a single Python literal, used as the total.
    3. Neither file present: ``total_size`` is ``None``.

    Args:
        shards: shard path pattern, expanded with ``braceexpand``.

    Returns:
        Tuple of the total sample count (or ``None``) and the shard count.
    """
    shards_list = list(braceexpand.braceexpand(shards))
    dir_path = os.path.dirname(shards)
    sizes_filename = os.path.join(dir_path, 'sizes.json')
    len_filename = os.path.join(dir_path, '__len__')
    if os.path.exists(sizes_filename):
        # Context managers close the files deterministically; the previous
        # bare open() calls left that to the garbage collector.
        with open(sizes_filename, 'r') as f:
            sizes = json.load(f)
        total_size = sum(int(sizes[os.path.basename(shard)]) for shard in shards_list)
    elif os.path.exists(len_filename):
        # literal_eval, not eval: the file content is treated as data only.
        with open(len_filename, 'r') as f:
            total_size = ast.literal_eval(f.read())
    else:
        total_size = None  # num samples undefined
        # some common dataset sizes (at time of author's last download)
        # CC3M (train): 2905954
        # CC12M: 10968539
        # LAION-400M: 407332084
        # LAION-2B (english): 2170337258
    num_shards = len(shards_list)
    return total_size, num_shards
100 |
101 |
102 |
def get_imagenet(args, preprocess_fns, split):
103 |
assert split in ["train", "val", "v2"]
104 |
is_train = split == "train"
105 |
preprocess_train, preprocess_val = preprocess_fns
106 |
107 |
if split == "v2":
108 |
from imagenetv2_pytorch import ImageNetV2Dataset
109 |
dataset = ImageNetV2Dataset(location=args.imagenet_v2, transform=preprocess_val)
110 |
111 |
if is_train:
112 |
data_path = args.imagenet_train
113 |
preprocess_fn = preprocess_train
114 |
115 |
data_path = args.imagenet_val
116 |
preprocess_fn = preprocess_val
117 |
assert data_path
118 |
119 |
dataset = datasets.ImageFolder(data_path, transform=preprocess_fn)
120 |
121 |
if is_train:
122 |
idxs = np.zeros(len(dataset.targets))
123 |
target_array = np.array(dataset.targets)
124 |
k = 50
125 |
for c in range(1000):
126 |
m = target_array == c
127 |
n = len(idxs[m])
128 |
arr = np.zeros(n)
129 |
arr[:k] = 1
130 |
131 |
idxs[m] = arr
132 |
133 |
idxs = idxs.astype('int')
134 |
sampler = SubsetRandomSampler(np.where(idxs)[0])
135 |
136 |
sampler = None
137 |
138 |
dataloader =
139 |
140 |
141 |
142 |
143 |
144 |
145 |
return DataInfo(dataloader=dataloader, sampler=sampler)
146 |
147 |
148 |
def count_samples(dataloader):
    """Iterate ``dataloader`` once and count its samples and batches.

    Sets the ``WDS_EPOCH`` environment variable to ``"0"`` before iterating.
    Each item must unpack into ``(images, texts)`` of equal length.

    Returns:
        ``(n_elements, n_batches)``: summed ``len(images)`` and number of
        batches seen.
    """
    os.environ["WDS_EPOCH"] = "0"
    sample_total = 0
    batch_total = 0
    for batch_images, batch_texts in dataloader:
        batch_total += 1
        batch_len = len(batch_images)
        sample_total += batch_len
        assert batch_len == len(batch_texts)
    return sample_total, batch_total
156 |
157 |
158 |
def filter_no_caption(sample):
    """Return True when ``sample`` contains a ``'txt'`` entry."""
    has_caption = 'txt' in sample
    return has_caption
160 |
161 |
162 |
def log_and_continue(exn):
    """Exception handler that logs ``exn`` as a warning and returns True.

    Intended as a webdataset ``handler``; the True return value presumably
    tells the pipeline to skip the failing item and carry on (confirm against
    the webdataset handler contract).
    """
    message = f'Handling webdataset error ({repr(exn)}). Ignoring.'
    logging.warning(message)
    return True
166 |
167 |
168 |
def group_by_keys_nothrow(data, keys=base_plus_ext, lcase=True, suffixes=None, handler=None):
169 |
"""Return function over iterator that groups key, value pairs into samples.
170 |
171 |
:param keys: function that splits the key into key and extension (base_plus_ext)
172 |
:param lcase: convert suffixes to lower case (Default value = True)
173 |
174 |
current_sample = None
175 |
for filesample in data:
176 |
assert isinstance(filesample, dict)
177 |
fname, value = filesample["fname"], filesample["data"]
178 |
prefix, suffix = keys(fname)
179 |
if prefix is None:
180 |
181 |
if lcase:
182 |
suffix = suffix.lower()
183 |
# FIXME webdataset version throws if suffix in current_sample, but we have a potential for
184 |
# this happening in the current LAION400m dataset if a tar ends with same prefix as the next
185 |
# begins, rare, but can happen since prefix aren't unique across tar files in that dataset
186 |
if current_sample is None or prefix != current_sample["__key__"] or suffix in current_sample:
187 |
if valid_sample(current_sample):
188 |
yield current_sample
189 |
current_sample = dict(__key__=prefix, __url__=filesample["__url__"])
190 |
if suffixes is None or suffix in suffixes:
191 |
current_sample[suffix] = value
192 |
if valid_sample(current_sample):
193 |
yield current_sample
194 |
195 |
196 |
def tarfile_to_samples_nothrow(src, handler=log_and_continue):
    """Chain ``url_opener``, ``tar_file_expander`` and ``group_by_keys_nothrow`` over ``src``.

    :param src: input passed to ``url_opener``
    :param handler: error handler forwarded to every stage (Default value = log_and_continue)
    :return: the iterator of samples produced by ``group_by_keys_nothrow``
    """
    # NOTE this is a re-impl of the webdataset impl with group_by_keys that doesn't throw
    streams = url_opener(src, handler=handler)
    files = tar_file_expander(streams, handler=handler)
    samples = group_by_keys_nothrow(files, handler=handler)
    return samples
202 |
203 |
204 |
def pytorch_worker_seed():
    """Get the dataloader worker seed from pytorch.

    :return: ``worker_info.seed`` when ``get_worker_info()`` returns worker info,
        otherwise the value of ``wds.utils.pytorch_worker_seed()``
    """
    worker_info = get_worker_info()
    if worker_info is not None:
        # favour the seed already created for pytorch dataloader workers if it exists
        return worker_info.seed
    # fallback to wds rank based seed
    return wds.utils.pytorch_worker_seed()
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
class detshuffle2(wds.PipelineStage):
221 |
def __init__(
222 |
223 |
224 |
225 |
226 |
227 |
228 |
self.bufsize = bufsize
229 |
self.initial = initial
230 |
self.seed = seed
231 |
self.epoch = epoch
232 |
233 |
def run(self, src):
234 |
if isinstance(self.epoch, SharedEpoch):
235 |
epoch = self.epoch.get_value()
236 |
237 |
# NOTE: this epoch tracking is problematic in a multiprocess (dataloader workers or train)
238 |
# situation as different workers may wrap at different times (or not at all).
239 |
self.epoch += 1
240 |
epoch = self.epoch
241 |
rng = random.Random()
242 |
if self.seed < 0:
243 |
seed = pytorch_worker_seed() + epoch
244 |
245 |
seed = self.seed + epoch
246 |
247 |
return _shuffle(src, self.bufsize, self.initial, rng)
248 |
249 |
250 |
class ResampledShards2(IterableDataset):
251 |
"""An iterable dataset yielding a list of urls."""
252 |
253 |
def __init__(
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
"""Sample shards from the shard list with replacement.
262 |
263 |
:param urls: a list of URLs as a Python list or brace notation string
264 |
265 |
266 |
urls = wds.shardlists.expand_urls(urls)
267 |
self.urls = urls
268 |
assert isinstance(self.urls[0], str)
269 |
self.nshards = nshards
270 |
self.rng = random.Random()
271 |
self.worker_seed = pytorch_worker_seed if worker_seed is None else worker_seed
272 |
self.deterministic = deterministic
273 |
self.epoch = epoch
274 |
275 |
def __iter__(self):
276 |
"""Return an iterator over the shards."""
277 |
if isinstance(self.epoch, SharedEpoch):
278 |
epoch = self.epoch.get_value()
279 |
280 |
# NOTE: this epoch tracking is problematic in a multiprocess (dataloader workers or train)
281 |
# situation as different workers may wrap at different times (or not at all).
282 |
self.epoch += 1
283 |
epoch = self.epoch
284 |
if self.deterministic:
285 |
# reset seed w/ epoch if deterministic, worker seed should be deterministic due to arg.seed
286 |
self.rng.seed(self.worker_seed() + epoch)
287 |
for _ in range(self.nshards):
288 |
yield dict(url=self.rng.choice(self.urls))
289 |
290 |
291 |
def get_wds_dataset(args, preprocess_img, is_train, epoch=0, floor=False):
292 |
input_shards = args.train_data if is_train else args.val_data
293 |
assert input_shards is not None
294 |
resampled = getattr(args, 'dataset_resampled', False) and is_train
295 |
296 |
num_samples, num_shards = get_dataset_size(input_shards)
297 |
if not num_samples:
298 |
if is_train:
299 |
num_samples = args.train_num_samples
300 |
if not num_samples:
301 |
raise RuntimeError(
302 |
'Currently, number of dataset samples must be specified for training dataset. '
303 |
'Please specify via `--train-num-samples` if no dataset length info present.')
304 |
305 |
num_samples = args.val_num_samples or 0 # eval will just exhaust the iterator if not specified
306 |
307 |
shared_epoch = SharedEpoch(epoch=epoch) # create a shared epoch store to sync epoch to dataloader worker proc
308 |
if resampled:
309 |
pipeline = [ResampledShards2(input_shards, deterministic=True, epoch=shared_epoch)]
310 |
311 |
pipeline = [wds.SimpleShardList(input_shards)]
312 |
313 |
# at this point we have an iterator over all the shards
314 |
if is_train:
315 |
if not resampled:
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
# at this point, we have an iterator over the shards assigned to each worker at each node
328 |
tarfile_to_samples_nothrow, # wds.tarfile_to_samples(handler=log_and_continue),
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
337 |
# at this point, we have an iterator over the shards assigned to each worker
338 |
339 |
340 |
341 |
342 |
wds.decode("pilrgb", handler=log_and_continue),
343 |
wds.rename(image="jpg;png", text="txt"),
344 |
wds.map_dict(image=preprocess_img, text=preprocess_txt),
345 |
wds.to_tuple("image", "text"),
346 |
wds.batched(args.batch_size, partial=not is_train),
347 |
348 |
349 |
dataset = wds.DataPipeline(*pipeline)
350 |
if is_train:
351 |
if not resampled:
352 |
assert num_shards >= args.workers * args.world_size, 'number of shards must be >= total workers'
353 |
# roll over and repeat a few samples to get same number of full batches on each node
354 |
round_fn = math.floor if floor else math.ceil
355 |
global_batch_size = args.batch_size * args.world_size
356 |
num_batches = round_fn(num_samples / global_batch_size)
357 |
num_workers = max(1, args.workers)
358 |
num_worker_batches = round_fn(num_batches / num_workers) # per dataloader worker
359 |
num_batches = num_worker_batches * num_workers
360 |
num_samples = num_batches * global_batch_size
361 |
dataset = dataset.with_epoch(num_worker_batches) # each worker is iterating over this
362 |
363 |
# last batches are partial, eval is done on single (master) node
364 |
num_batches = math.ceil(num_samples / args.batch_size)
365 |
366 |
dataloader = wds.WebLoader(
367 |
368 |
369 |
370 |
371 |
372 |
373 |
374 |
# FIXME not clear which approach is better, with_epoch before vs after dataloader?
375 |
# hoping to resolve via
376 |
# if is_train:
377 |
# # roll over and repeat a few samples to get same number of full batches on each node
378 |
# global_batch_size = args.batch_size * args.world_size
379 |
# num_batches = math.ceil(num_samples / global_batch_size)
380 |
# num_workers = max(1, args.workers)
381 |
# num_batches = math.ceil(num_batches / num_workers) * num_workers
382 |
# num_samples = num_batches * global_batch_size
383 |
# dataloader = dataloader.with_epoch(num_batches)
384 |
# else:
385 |
# # last batches are partial, eval is done on single (master) node
386 |
# num_batches = math.ceil(num_samples / args.batch_size)
387 |
388 |
# add meta-data to dataloader instance for convenience
389 |
dataloader.num_batches = num_batches
390 |
dataloader.num_samples = num_samples
391 |
392 |
return DataInfo(dataloader=dataloader, shared_epoch=shared_epoch)
393 |
394 |
395 |
def get_csv_dataset(args, preprocess_fn, is_train, epoch=0):
396 |
input_filename = args.train_data if is_train else args.val_data
397 |
assert input_filename
398 |
dataset = CsvDataset(
399 |
400 |
401 |
402 |
403 |
404 |
num_samples = len(dataset)
405 |
sampler = DistributedSampler(dataset) if args.distributed and is_train else None
406 |
shuffle = is_train and sampler is None
407 |
408 |
dataloader = DataLoader(
409 |
410 |
411 |
412 |
413 |
414 |
415 |
416 |
417 |
dataloader.num_samples = num_samples
418 |
dataloader.num_batches = len(dataloader)
419 |
420 |
return DataInfo(dataloader, sampler)
421 |
422 |
423 |
def get_dataset_fn(data_path, dataset_type):
    """Select the dataset-building function for ``dataset_type``.

    :param data_path: dataset path; only inspected when ``dataset_type`` is ``"auto"``,
        in which case the text after the last ``'.'`` decides the loader
    :param dataset_type: one of ``"webdataset"``, ``"csv"`` or ``"auto"``
    :return: ``get_wds_dataset`` or ``get_csv_dataset``
    :raises ValueError: if ``dataset_type`` is unsupported, or if it is ``"auto"``
        and the extension is not one of ``csv``, ``tsv`` or ``tar``
    """
    if dataset_type == "webdataset":
        return get_wds_dataset
    elif dataset_type == "csv":
        return get_csv_dataset
    elif dataset_type == "auto":
        ext = data_path.split('.')[-1]
        if ext in ['csv', 'tsv']:
            return get_csv_dataset
        elif ext in ['tar']:
            return get_wds_dataset
        else:
            raise ValueError(
                f"Tried to figure out dataset type, but failed for extention {ext}.")
    else:
        raise ValueError(f"Unsupported dataset type: {dataset_type}")
439 |
440 |
441 |
def get_data(args, preprocess_fns, epoch=0):
    """Build the dict of datasets requested by ``args``.

    :param args: object with ``train_data``, ``val_data``, ``imagenet_val`` and
        ``imagenet_v2`` attributes; ``dataset_type`` is read as well when
        ``train_data`` or ``val_data`` is set
    :param preprocess_fns: pair ``(preprocess_train, preprocess_val)``
    :param epoch: forwarded to the training dataset builder only (Default value = 0)
    :return: dict that may contain the keys ``"train"``, ``"val"``,
        ``"imagenet-val"`` and ``"imagenet-v2"``
    """
    preprocess_train, preprocess_val = preprocess_fns
    data = {}

    if args.train_data:
        data["train"] = get_dataset_fn(args.train_data, args.dataset_type)(
            args, preprocess_train, is_train=True, epoch=epoch)

    if args.val_data:
        data["val"] = get_dataset_fn(args.val_data, args.dataset_type)(
            args, preprocess_val, is_train=False)

    # the imagenet helpers receive both preprocess functions, not just one
    if args.imagenet_val is not None:
        data["imagenet-val"] = get_imagenet(args, preprocess_fns, "val")

    if args.imagenet_v2 is not None:
        data["imagenet-v2"] = get_imagenet(args, preprocess_fns, "v2")

    return data
@@ -0,0 +1,995 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
import os
3 |
4 |
from import DatasetCatalog, MetadataCatalog
5 |
from import load_sem_seg
6 |
7 |
8 |
{"name": "wall", "id": 2978, "trainId": 0},
9 |
{"name": "building, edifice", "id": 312, "trainId": 1},
10 |
{"name": "sky", "id": 2420, "trainId": 2},
11 |
{"name": "tree", "id": 2855, "trainId": 3},
12 |
{"name": "road, route", "id": 2131, "trainId": 4},
13 |
{"name": "floor, flooring", "id": 976, "trainId": 5},
14 |
{"name": "ceiling", "id": 447, "trainId": 6},
15 |
{"name": "bed", "id": 165, "trainId": 7},
16 |
{"name": "sidewalk, pavement", "id": 2377, "trainId": 8},
17 |
{"name": "earth, ground", "id": 838, "trainId": 9},
18 |
{"name": "cabinet", "id": 350, "trainId": 10},
19 |
20 |
"name": "person, individual, someone, somebody, mortal, soul",
21 |
"id": 1831,
22 |
"trainId": 11,
23 |
24 |
{"name": "grass", "id": 1125, "trainId": 12},
25 |
{"name": "windowpane, window", "id": 3055, "trainId": 13},
26 |
{"name": "car, auto, automobile, machine, motorcar", "id": 401, "trainId": 14},
27 |
{"name": "mountain, mount", "id": 1610, "trainId": 15},
28 |
{"name": "plant, flora, plant life", "id": 1910, "trainId": 16},
29 |
{"name": "table", "id": 2684, "trainId": 17},
30 |
{"name": "chair", "id": 471, "trainId": 18},
31 |
{"name": "curtain, drape, drapery, mantle, pall", "id": 687, "trainId": 19},
32 |
{"name": "door", "id": 774, "trainId": 20},
33 |
{"name": "sofa, couch, lounge", "id": 2473, "trainId": 21},
34 |
{"name": "sea", "id": 2264, "trainId": 22},
35 |
{"name": "painting, picture", "id": 1735, "trainId": 23},
36 |
{"name": "water", "id": 2994, "trainId": 24},
37 |
{"name": "mirror", "id": 1564, "trainId": 25},
38 |
{"name": "house", "id": 1276, "trainId": 26},
39 |
{"name": "rug, carpet, carpeting", "id": 2178, "trainId": 27},
40 |
{"name": "shelf", "id": 2329, "trainId": 28},
41 |
{"name": "armchair", "id": 57, "trainId": 29},
42 |
{"name": "fence, fencing", "id": 907, "trainId": 30},
43 |
{"name": "field", "id": 913, "trainId": 31},
44 |
{"name": "lamp", "id": 1395, "trainId": 32},
45 |
{"name": "rock, stone", "id": 2138, "trainId": 33},
46 |
{"name": "seat", "id": 2272, "trainId": 34},
47 |
{"name": "river", "id": 2128, "trainId": 35},
48 |
{"name": "desk", "id": 724, "trainId": 36},
49 |
{"name": "bathtub, bathing tub, bath, tub", "id": 155, "trainId": 37},
50 |
{"name": "railing, rail", "id": 2053, "trainId": 38},
51 |
{"name": "signboard, sign", "id": 2380, "trainId": 39},
52 |
{"name": "cushion", "id": 689, "trainId": 40},
53 |
{"name": "path", "id": 1788, "trainId": 41},
54 |
{"name": "work surface", "id": 3087, "trainId": 42},
55 |
{"name": "stairs, steps", "id": 2530, "trainId": 43},
56 |
{"name": "column, pillar", "id": 581, "trainId": 44},
57 |
{"name": "sink", "id": 2388, "trainId": 45},
58 |
{"name": "wardrobe, closet, press", "id": 2985, "trainId": 46},
59 |
{"name": "snow", "id": 2454, "trainId": 47},
60 |
{"name": "refrigerator, icebox", "id": 2096, "trainId": 48},
61 |
{"name": "base, pedestal, stand", "id": 137, "trainId": 49},
62 |
{"name": "bridge, span", "id": 294, "trainId": 50},
63 |
{"name": "blind, screen", "id": 212, "trainId": 51},
64 |
{"name": "runway", "id": 2185, "trainId": 52},
65 |
{"name": "cliff, drop, drop-off", "id": 524, "trainId": 53},
66 |
{"name": "sand", "id": 2212, "trainId": 54},
67 |
{"name": "fireplace, hearth, open fireplace", "id": 943, "trainId": 55},
68 |
{"name": "pillow", "id": 1869, "trainId": 56},
69 |
{"name": "screen door, screen", "id": 2251, "trainId": 57},
70 |
71 |
"name": "toilet, can, commode, crapper, pot, potty, stool, throne",
72 |
"id": 2793,
73 |
"trainId": 58,
74 |
75 |
{"name": "skyscraper", "id": 2423, "trainId": 59},
76 |
{"name": "grandstand, covered stand", "id": 1121, "trainId": 60},
77 |
{"name": "box", "id": 266, "trainId": 61},
78 |
{"name": "pool table, billiard table, snooker table", "id": 1948, "trainId": 62},
79 |
{"name": "palm, palm tree", "id": 1744, "trainId": 63},
80 |
{"name": "double door", "id": 783, "trainId": 64},
81 |
{"name": "coffee table, cocktail table", "id": 571, "trainId": 65},
82 |
{"name": "counter", "id": 627, "trainId": 66},
83 |
{"name": "countertop", "id": 629, "trainId": 67},
84 |
{"name": "chest of drawers, chest, bureau, dresser", "id": 491, "trainId": 68},
85 |
{"name": "kitchen island", "id": 1374, "trainId": 69},
86 |
{"name": "boat", "id": 223, "trainId": 70},
87 |
{"name": "waterfall, falls", "id": 3016, "trainId": 71},
88 |
89 |
"name": "stove, kitchen stove, range, kitchen range, cooking stove",
90 |
"id": 2598,
91 |
"trainId": 72,
92 |
93 |
{"name": "flower", "id": 978, "trainId": 73},
94 |
{"name": "bookcase", "id": 239, "trainId": 74},
95 |
{"name": "controls", "id": 608, "trainId": 75},
96 |
{"name": "book", "id": 236, "trainId": 76},
97 |
{"name": "stairway, staircase", "id": 2531, "trainId": 77},
98 |
{"name": "streetlight, street lamp", "id": 2616, "trainId": 78},
99 |
100 |
"name": "computer, computing machine, computing device, data processor, electronic computer, information processing system",
101 |
"id": 591,
102 |
"trainId": 79,
103 |
104 |
105 |
"name": "bus, autobus, coach, charabanc, double-decker, jitney, motorbus, motorcoach, omnibus, passenger vehicle",
106 |
"id": 327,
107 |
"trainId": 80,
108 |
109 |
{"name": "swivel chair", "id": 2679, "trainId": 81},
110 |
{"name": "light, light source", "id": 1451, "trainId": 82},
111 |
{"name": "bench", "id": 181, "trainId": 83},
112 |
{"name": "case, display case, showcase, vitrine", "id": 420, "trainId": 84},
113 |
{"name": "towel", "id": 2821, "trainId": 85},
114 |
{"name": "fountain", "id": 1023, "trainId": 86},
115 |
{"name": "embankment", "id": 855, "trainId": 87},
116 |
117 |
"name": "television receiver, television, television set, tv, tv set, idiot box, boob tube, telly, goggle box",
118 |
"id": 2733,
119 |
"trainId": 88,
120 |
121 |
{"name": "van", "id": 2928, "trainId": 89},
122 |
{"name": "hill", "id": 1240, "trainId": 90},
123 |
{"name": "awning, sunshade, sunblind", "id": 77, "trainId": 91},
124 |
{"name": "poster, posting, placard, notice, bill, card", "id": 1969, "trainId": 92},
125 |
{"name": "truck, motortruck", "id": 2880, "trainId": 93},
126 |
{"name": "airplane, aeroplane, plane", "id": 14, "trainId": 94},
127 |
{"name": "pole", "id": 1936, "trainId": 95},
128 |
{"name": "tower", "id": 2828, "trainId": 96},
129 |
{"name": "court", "id": 631, "trainId": 97},
130 |
{"name": "ball", "id": 103, "trainId": 98},
131 |
132 |
"name": "aircraft carrier, carrier, flattop, attack aircraft carrier",
133 |
"id": 3144,
134 |
"trainId": 99,
135 |
136 |
{"name": "buffet, counter, sideboard", "id": 308, "trainId": 100},
137 |
{"name": "hovel, hut, hutch, shack, shanty", "id": 1282, "trainId": 101},
138 |
{"name": "apparel, wearing apparel, dress, clothes", "id": 38, "trainId": 102},
139 |
{"name": "minibike, motorbike", "id": 1563, "trainId": 103},
140 |
141 |
"name": "animal, animate being, beast, brute, creature, fauna",
142 |
"id": 29,
143 |
"trainId": 104,
144 |
145 |
{"name": "chandelier, pendant, pendent", "id": 480, "trainId": 105},
146 |
{"name": "step, stair", "id": 2569, "trainId": 106},
147 |
{"name": "booth, cubicle, stall, kiosk", "id": 247, "trainId": 107},
148 |
{"name": "bicycle, bike, wheel, cycle", "id": 187, "trainId": 108},
149 |
{"name": "doorframe, doorcase", "id": 778, "trainId": 109},
150 |
{"name": "sconce", "id": 2243, "trainId": 110},
151 |
{"name": "pond", "id": 1941, "trainId": 111},
152 |
{"name": "trade name, brand name, brand, marque", "id": 2833, "trainId": 112},
153 |
154 |
"name": "bannister, banister, balustrade, balusters, handrail",
155 |
"id": 120,
156 |
"trainId": 113,
157 |
158 |
{"name": "bag", "id": 95, "trainId": 114},
159 |
{"name": "traffic light, traffic signal, stoplight", "id": 2836, "trainId": 115},
160 |
{"name": "gazebo", "id": 1087, "trainId": 116},
161 |
{"name": "escalator, moving staircase, moving stairway", "id": 868, "trainId": 117},
162 |
{"name": "land, ground, soil", "id": 1401, "trainId": 118},
163 |
{"name": "board, plank", "id": 220, "trainId": 119},
164 |
{"name": "arcade machine", "id": 47, "trainId": 120},
165 |
{"name": "eiderdown, duvet, continental quilt", "id": 843, "trainId": 121},
166 |
{"name": "bar", "id": 123, "trainId": 122},
167 |
{"name": "stall, stand, sales booth", "id": 2537, "trainId": 123},
168 |
{"name": "playground", "id": 1927, "trainId": 124},
169 |
{"name": "ship", "id": 2337, "trainId": 125},
170 |
{"name": "ottoman, pouf, pouffe, puff, hassock", "id": 1702, "trainId": 126},
171 |
172 |
"name": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin",
173 |
"id": 64,
174 |
"trainId": 127,
175 |
176 |
{"name": "bottle", "id": 249, "trainId": 128},
177 |
{"name": "cradle", "id": 642, "trainId": 129},
178 |
{"name": "pot, flowerpot", "id": 1981, "trainId": 130},
179 |
180 |
"name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter",
181 |
"id": 609,
182 |
"trainId": 131,
183 |
184 |
{"name": "train, railroad train", "id": 2840, "trainId": 132},
185 |
{"name": "stool", "id": 2586, "trainId": 133},
186 |
{"name": "lake", "id": 1393, "trainId": 134},
187 |
{"name": "tank, storage tank", "id": 2704, "trainId": 135},
188 |
{"name": "ice, water ice", "id": 1304, "trainId": 136},
189 |
{"name": "basket, handbasket", "id": 146, "trainId": 137},
190 |
{"name": "manhole", "id": 1494, "trainId": 138},
191 |
{"name": "tent, collapsible shelter", "id": 2739, "trainId": 139},
192 |
{"name": "canopy", "id": 389, "trainId": 140},
193 |
{"name": "microwave, microwave oven", "id": 1551, "trainId": 141},
194 |
{"name": "barrel, cask", "id": 131, "trainId": 142},
195 |
{"name": "dirt track", "id": 738, "trainId": 143},
196 |
{"name": "beam", "id": 161, "trainId": 144},
197 |
{"name": "dishwasher, dish washer, dishwashing machine", "id": 747, "trainId": 145},
198 |
{"name": "plate", "id": 1919, "trainId": 146},
199 |
{"name": "screen, crt screen", "id": 3109, "trainId": 147},
200 |
{"name": "ruins", "id": 2179, "trainId": 148},
201 |
{"name": "washer, automatic washer, washing machine", "id": 2989, "trainId": 149},
202 |
{"name": "blanket, cover", "id": 206, "trainId": 150},
203 |
{"name": "plaything, toy", "id": 1930, "trainId": 151},
204 |
{"name": "food, solid food", "id": 1002, "trainId": 152},
205 |
{"name": "screen, silver screen, projection screen", "id": 2254, "trainId": 153},
206 |
{"name": "oven", "id": 1708, "trainId": 154},
207 |
{"name": "stage", "id": 2526, "trainId": 155},
208 |
{"name": "beacon, lighthouse, beacon light, pharos", "id": 160, "trainId": 156},
209 |
{"name": "umbrella", "id": 2901, "trainId": 157},
210 |
{"name": "sculpture", "id": 2262, "trainId": 158},
211 |
{"name": "aqueduct", "id": 44, "trainId": 159},
212 |
{"name": "container", "id": 597, "trainId": 160},
213 |
{"name": "scaffolding, staging", "id": 2235, "trainId": 161},
214 |
{"name": "hood, exhaust hood", "id": 1260, "trainId": 162},
215 |
{"name": "curb, curbing, kerb", "id": 682, "trainId": 163},
216 |
{"name": "roller coaster", "id": 2151, "trainId": 164},
217 |
{"name": "horse, equus caballus", "id": 3107, "trainId": 165},
218 |
{"name": "catwalk", "id": 432, "trainId": 166},
219 |
{"name": "glass, drinking glass", "id": 1098, "trainId": 167},
220 |
{"name": "vase", "id": 2932, "trainId": 168},
221 |
{"name": "central reservation", "id": 461, "trainId": 169},
222 |
{"name": "carousel", "id": 410, "trainId": 170},
223 |
{"name": "radiator", "id": 2046, "trainId": 171},
224 |
{"name": "closet", "id": 533, "trainId": 172},
225 |
{"name": "machine", "id": 1481, "trainId": 173},
226 |
{"name": "pier, wharf, wharfage, dock", "id": 1858, "trainId": 174},
227 |
{"name": "fan", "id": 894, "trainId": 175},
228 |
{"name": "inflatable bounce game", "id": 1322, "trainId": 176},
229 |
{"name": "pitch", "id": 1891, "trainId": 177},
230 |
{"name": "paper", "id": 1756, "trainId": 178},
231 |
{"name": "arcade, colonnade", "id": 49, "trainId": 179},
232 |
{"name": "hot tub", "id": 1272, "trainId": 180},
233 |
{"name": "helicopter", "id": 1229, "trainId": 181},
234 |
{"name": "tray", "id": 2850, "trainId": 182},
235 |
{"name": "partition, divider", "id": 1784, "trainId": 183},
236 |
{"name": "vineyard", "id": 2962, "trainId": 184},
237 |
{"name": "bowl", "id": 259, "trainId": 185},
238 |
{"name": "bullring", "id": 319, "trainId": 186},
239 |
{"name": "flag", "id": 954, "trainId": 187},
240 |
{"name": "pot", "id": 1974, "trainId": 188},
241 |
{"name": "footbridge, overcrossing, pedestrian bridge", "id": 1013, "trainId": 189},
242 |
{"name": "shower", "id": 2356, "trainId": 190},
243 |
244 |
"name": "bag, traveling bag, travelling bag, grip, suitcase",
245 |
"id": 97,
246 |
"trainId": 191,
247 |
248 |
{"name": "bulletin board, notice board", "id": 318, "trainId": 192},
249 |
{"name": "confessional booth", "id": 592, "trainId": 193},
250 |
{"name": "trunk, tree trunk, bole", "id": 2885, "trainId": 194},
251 |
{"name": "forest", "id": 1017, "trainId": 195},
252 |
{"name": "elevator door", "id": 851, "trainId": 196},
253 |
{"name": "laptop, laptop computer", "id": 1407, "trainId": 197},
254 |
{"name": "instrument panel", "id": 1332, "trainId": 198},
255 |
{"name": "bucket, pail", "id": 303, "trainId": 199},
256 |
{"name": "tapestry, tapis", "id": 2714, "trainId": 200},
257 |
{"name": "platform", "id": 1924, "trainId": 201},
258 |
{"name": "jacket", "id": 1346, "trainId": 202},
259 |
{"name": "gate", "id": 1081, "trainId": 203},
260 |
{"name": "monitor, monitoring device", "id": 1583, "trainId": 204},
261 |
262 |
"name": "telephone booth, phone booth, call box, telephone box, telephone kiosk",
263 |
"id": 2727,
264 |
"trainId": 205,
265 |
266 |
{"name": "spotlight, spot", "id": 2509, "trainId": 206},
267 |
{"name": "ring", "id": 2123, "trainId": 207},
268 |
{"name": "control panel", "id": 602, "trainId": 208},
269 |
{"name": "blackboard, chalkboard", "id": 202, "trainId": 209},
270 |
{"name": "air conditioner, air conditioning", "id": 10, "trainId": 210},
271 |
{"name": "chest", "id": 490, "trainId": 211},
272 |
{"name": "clock", "id": 530, "trainId": 212},
273 |
{"name": "sand dune", "id": 2213, "trainId": 213},
274 |
{"name": "pipe, pipage, piping", "id": 1884, "trainId": 214},
275 |
{"name": "vault", "id": 2934, "trainId": 215},
276 |
{"name": "table football", "id": 2687, "trainId": 216},
277 |
{"name": "cannon", "id": 387, "trainId": 217},
278 |
{"name": "swimming pool, swimming bath, natatorium", "id": 2668, "trainId": 218},
279 |
{"name": "fluorescent, fluorescent fixture", "id": 982, "trainId": 219},
280 |
{"name": "statue", "id": 2547, "trainId": 220},
281 |
282 |
"name": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system",
283 |
"id": 1474,
284 |
"trainId": 221,
285 |
286 |
{"name": "exhibitor", "id": 877, "trainId": 222},
287 |
{"name": "ladder", "id": 1391, "trainId": 223},
288 |
{"name": "carport", "id": 414, "trainId": 224},
289 |
{"name": "dam", "id": 698, "trainId": 225},
290 |
{"name": "pulpit", "id": 2019, "trainId": 226},
291 |
{"name": "skylight, fanlight", "id": 2422, "trainId": 227},
292 |
{"name": "water tower", "id": 3010, "trainId": 228},
293 |
{"name": "grill, grille, grillwork", "id": 1139, "trainId": 229},
294 |
{"name": "display board", "id": 753, "trainId": 230},
295 |
{"name": "pane, pane of glass, window glass", "id": 1747, "trainId": 231},
296 |
{"name": "rubbish, trash, scrap", "id": 2175, "trainId": 232},
297 |
{"name": "ice rink", "id": 1301, "trainId": 233},
298 |
{"name": "fruit", "id": 1033, "trainId": 234},
299 |
{"name": "patio", "id": 1789, "trainId": 235},
300 |
{"name": "vending machine", "id": 2939, "trainId": 236},
301 |
{"name": "telephone, phone, telephone set", "id": 2730, "trainId": 237},
302 |
{"name": "net", "id": 1652, "trainId": 238},
303 |
304 |
"name": "backpack, back pack, knapsack, packsack, rucksack, haversack",
305 |
"id": 90,
306 |
"trainId": 239,
307 |
308 |
{"name": "jar", "id": 1349, "trainId": 240},
309 |
{"name": "track", "id": 2830, "trainId": 241},
310 |
{"name": "magazine", "id": 1485, "trainId": 242},
311 |
{"name": "shutter", "id": 2370, "trainId": 243},
312 |
{"name": "roof", "id": 2155, "trainId": 244},
313 |
{"name": "banner, streamer", "id": 118, "trainId": 245},
314 |
{"name": "landfill", "id": 1402, "trainId": 246},
315 |
{"name": "post", "id": 1957, "trainId": 247},
316 |
{"name": "altarpiece, reredos", "id": 3130, "trainId": 248},
317 |
{"name": "hat, chapeau, lid", "id": 1197, "trainId": 249},
318 |
{"name": "arch, archway", "id": 52, "trainId": 250},
319 |
{"name": "table game", "id": 2688, "trainId": 251},
320 |
{"name": "bag, handbag, pocketbook, purse", "id": 96, "trainId": 252},
321 |
{"name": "document, written document, papers", "id": 762, "trainId": 253},
322 |
{"name": "dome", "id": 772, "trainId": 254},
323 |
{"name": "pier", "id": 1857, "trainId": 255},
324 |
{"name": "shanties", "id": 2315, "trainId": 256},
325 |
{"name": "forecourt", "id": 1016, "trainId": 257},
326 |
{"name": "crane", "id": 643, "trainId": 258},
327 |
{"name": "dog, domestic dog, canis familiaris", "id": 3105, "trainId": 259},
328 |
{"name": "piano, pianoforte, forte-piano", "id": 1849, "trainId": 260},
329 |
{"name": "drawing", "id": 791, "trainId": 261},
330 |
{"name": "cabin", "id": 349, "trainId": 262},
331 |
332 |
"name": "ad, advertisement, advertizement, advertising, advertizing, advert",
333 |
"id": 6,
334 |
"trainId": 263,
335 |
336 |
{"name": "amphitheater, amphitheatre, coliseum", "id": 3114, "trainId": 264},
337 |
{"name": "monument", "id": 1587, "trainId": 265},
338 |
{"name": "henhouse", "id": 1233, "trainId": 266},
339 |
{"name": "cockpit", "id": 559, "trainId": 267},
340 |
{"name": "heater, warmer", "id": 1223, "trainId": 268},
341 |
{"name": "windmill, aerogenerator, wind generator", "id": 3049, "trainId": 269},
342 |
{"name": "pool", "id": 1943, "trainId": 270},
343 |
{"name": "elevator, lift", "id": 853, "trainId": 271},
344 |
{"name": "decoration, ornament, ornamentation", "id": 709, "trainId": 272},
345 |
{"name": "labyrinth", "id": 1390, "trainId": 273},
346 |
{"name": "text, textual matter", "id": 2748, "trainId": 274},
347 |
{"name": "printer", "id": 2007, "trainId": 275},
348 |
{"name": "mezzanine, first balcony", "id": 1546, "trainId": 276},
349 |
{"name": "mattress", "id": 1513, "trainId": 277},
350 |
{"name": "straw", "id": 2600, "trainId": 278},
351 |
{"name": "stalls", "id": 2538, "trainId": 279},
352 |
{"name": "patio, terrace", "id": 1790, "trainId": 280},
353 |
{"name": "billboard, hoarding", "id": 194, "trainId": 281},
354 |
{"name": "bus stop", "id": 326, "trainId": 282},
355 |
{"name": "trouser, pant", "id": 2877, "trainId": 283},
356 |
{"name": "console table, console", "id": 594, "trainId": 284},
357 |
{"name": "rack", "id": 2036, "trainId": 285},
358 |
{"name": "notebook", "id": 1662, "trainId": 286},
359 |
{"name": "shrine", "id": 2366, "trainId": 287},
360 |
{"name": "pantry", "id": 1754, "trainId": 288},
361 |
{"name": "cart", "id": 418, "trainId": 289},
362 |
{"name": "steam shovel", "id": 2553, "trainId": 290},
363 |
{"name": "porch", "id": 1951, "trainId": 291},
364 |
{"name": "postbox, mailbox, letter box", "id": 1963, "trainId": 292},
365 |
{"name": "figurine, statuette", "id": 918, "trainId": 293},
366 |
{"name": "recycling bin", "id": 2086, "trainId": 294},
367 |
{"name": "folding screen", "id": 997, "trainId": 295},
368 |
{"name": "telescope", "id": 2731, "trainId": 296},
369 |
{"name": "deck chair, beach chair", "id": 704, "trainId": 297},
370 |
{"name": "kennel", "id": 1365, "trainId": 298},
371 |
{"name": "coffee maker", "id": 569, "trainId": 299},
372 |
{"name": "altar, communion table, lord's table", "id": 3108, "trainId": 300},
373 |
{"name": "fish", "id": 948, "trainId": 301},
374 |
{"name": "easel", "id": 839, "trainId": 302},
375 |
{"name": "artificial golf green", "id": 63, "trainId": 303},
376 |
{"name": "iceberg", "id": 1305, "trainId": 304},
377 |
{"name": "candlestick, candle holder", "id": 378, "trainId": 305},
378 |
{"name": "shower stall, shower bath", "id": 2362, "trainId": 306},
379 |
{"name": "television stand", "id": 2734, "trainId": 307},
380 |
381 |
"name": "wall socket, wall plug, electric outlet, electrical outlet, outlet, electric receptacle",
382 |
"id": 2982,
383 |
"trainId": 308,
384 |
385 |
{"name": "skeleton", "id": 2398, "trainId": 309},
386 |
{"name": "grand piano, grand", "id": 1119, "trainId": 310},
387 |
{"name": "candy, confect", "id": 382, "trainId": 311},
388 |
{"name": "grille door", "id": 1141, "trainId": 312},
389 |
{"name": "pedestal, plinth, footstall", "id": 1805, "trainId": 313},
390 |
{"name": "jersey, t-shirt, tee shirt", "id": 3102, "trainId": 314},
391 |
{"name": "shoe", "id": 2341, "trainId": 315},
392 |
{"name": "gravestone, headstone, tombstone", "id": 1131, "trainId": 316},
393 |
{"name": "shanty", "id": 2316, "trainId": 317},
394 |
{"name": "structure", "id": 2626, "trainId": 318},
395 |
{"name": "rocking chair, rocker", "id": 3104, "trainId": 319},
396 |
{"name": "bird", "id": 198, "trainId": 320},
397 |
{"name": "place mat", "id": 1896, "trainId": 321},
398 |
{"name": "tomb", "id": 2800, "trainId": 322},
399 |
{"name": "big top", "id": 190, "trainId": 323},
400 |
401 |
"name": "gas pump, gasoline pump, petrol pump, island dispenser",
402 |
"id": 3131,
403 |
"trainId": 324,
404 |
405 |
{"name": "lockers", "id": 1463, "trainId": 325},
406 |
{"name": "cage", "id": 357, "trainId": 326},
407 |
{"name": "finger", "id": 929, "trainId": 327},
408 |
{"name": "bleachers", "id": 209, "trainId": 328},
409 |
{"name": "ferris wheel", "id": 912, "trainId": 329},
410 |
{"name": "hairdresser chair", "id": 1164, "trainId": 330},
411 |
{"name": "mat", "id": 1509, "trainId": 331},
412 |
{"name": "stands", "id": 2539, "trainId": 332},
413 |
{"name": "aquarium, fish tank, marine museum", "id": 3116, "trainId": 333},
414 |
415 |
"name": "streetcar, tram, tramcar, trolley, trolley car",
416 |
"id": 2615,
417 |
"trainId": 334,
418 |
419 |
{"name": "napkin, table napkin, serviette", "id": 1644, "trainId": 335},
420 |
{"name": "dummy", "id": 818, "trainId": 336},
421 |
{"name": "booklet, brochure, folder, leaflet, pamphlet", "id": 242, "trainId": 337},
422 |
{"name": "sand trap", "id": 2217, "trainId": 338},
423 |
{"name": "shop, store", "id": 2347, "trainId": 339},
424 |
{"name": "table cloth", "id": 2686, "trainId": 340},
425 |
{"name": "service station", "id": 2300, "trainId": 341},
426 |
{"name": "coffin", "id": 572, "trainId": 342},
427 |
{"name": "drawer", "id": 789, "trainId": 343},
428 |
{"name": "cages", "id": 358, "trainId": 344},
429 |
{"name": "slot machine, coin machine", "id": 2443, "trainId": 345},
430 |
{"name": "balcony", "id": 101, "trainId": 346},
431 |
{"name": "volleyball court", "id": 2969, "trainId": 347},
432 |
{"name": "table tennis", "id": 2692, "trainId": 348},
433 |
{"name": "control table", "id": 606, "trainId": 349},
434 |
{"name": "shirt", "id": 2339, "trainId": 350},
435 |
{"name": "merchandise, ware, product", "id": 1533, "trainId": 351},
436 |
{"name": "railway", "id": 2060, "trainId": 352},
437 |
{"name": "parterre", "id": 1782, "trainId": 353},
438 |
{"name": "chimney", "id": 495, "trainId": 354},
439 |
{"name": "can, tin, tin can", "id": 371, "trainId": 355},
440 |
{"name": "tanks", "id": 2707, "trainId": 356},
441 |
{"name": "fabric, cloth, material, textile", "id": 889, "trainId": 357},
442 |
{"name": "alga, algae", "id": 3156, "trainId": 358},
443 |
{"name": "system", "id": 2683, "trainId": 359},
444 |
{"name": "map", "id": 1499, "trainId": 360},
445 |
{"name": "greenhouse", "id": 1135, "trainId": 361},
446 |
{"name": "mug", "id": 1619, "trainId": 362},
447 |
{"name": "barbecue", "id": 125, "trainId": 363},
448 |
{"name": "trailer", "id": 2838, "trainId": 364},
449 |
450 |
"name": "toilet tissue, toilet paper, bathroom tissue",
451 |
"id": 2792,
452 |
"trainId": 365,
453 |
454 |
{"name": "organ", "id": 1695, "trainId": 366},
455 |
{"name": "dishrag, dishcloth", "id": 746, "trainId": 367},
456 |
{"name": "island", "id": 1343, "trainId": 368},
457 |
{"name": "keyboard", "id": 1370, "trainId": 369},
458 |
{"name": "trench", "id": 2858, "trainId": 370},
459 |
{"name": "basket, basketball hoop, hoop", "id": 145, "trainId": 371},
460 |
{"name": "steering wheel, wheel", "id": 2565, "trainId": 372},
461 |
{"name": "pitcher, ewer", "id": 1892, "trainId": 373},
462 |
{"name": "goal", "id": 1103, "trainId": 374},
463 |
{"name": "bread, breadstuff, staff of life", "id": 286, "trainId": 375},
464 |
{"name": "beds", "id": 170, "trainId": 376},
465 |
{"name": "wood", "id": 3073, "trainId": 377},
466 |
{"name": "file cabinet", "id": 922, "trainId": 378},
467 |
{"name": "newspaper, paper", "id": 1655, "trainId": 379},
468 |
{"name": "motorboat", "id": 1602, "trainId": 380},
469 |
{"name": "rope", "id": 2160, "trainId": 381},
470 |
{"name": "guitar", "id": 1151, "trainId": 382},
471 |
{"name": "rubble", "id": 2176, "trainId": 383},
472 |
{"name": "scarf", "id": 2239, "trainId": 384},
473 |
{"name": "barrels", "id": 132, "trainId": 385},
474 |
{"name": "cap", "id": 394, "trainId": 386},
475 |
{"name": "leaves", "id": 1424, "trainId": 387},
476 |
{"name": "control tower", "id": 607, "trainId": 388},
477 |
{"name": "dashboard", "id": 700, "trainId": 389},
478 |
{"name": "bandstand", "id": 116, "trainId": 390},
479 |
{"name": "lectern", "id": 1425, "trainId": 391},
480 |
{"name": "switch, electric switch, electrical switch", "id": 2676, "trainId": 392},
481 |
{"name": "baseboard, mopboard, skirting board", "id": 141, "trainId": 393},
482 |
{"name": "shower room", "id": 2360, "trainId": 394},
483 |
{"name": "smoke", "id": 2449, "trainId": 395},
484 |
{"name": "faucet, spigot", "id": 897, "trainId": 396},
485 |
{"name": "bulldozer", "id": 317, "trainId": 397},
486 |
{"name": "saucepan", "id": 2228, "trainId": 398},
487 |
{"name": "shops", "id": 2351, "trainId": 399},
488 |
{"name": "meter", "id": 1543, "trainId": 400},
489 |
{"name": "crevasse", "id": 656, "trainId": 401},
490 |
{"name": "gear", "id": 1088, "trainId": 402},
491 |
{"name": "candelabrum, candelabra", "id": 373, "trainId": 403},
492 |
{"name": "sofa bed", "id": 2472, "trainId": 404},
493 |
{"name": "tunnel", "id": 2892, "trainId": 405},
494 |
{"name": "pallet", "id": 1740, "trainId": 406},
495 |
{"name": "wire, conducting wire", "id": 3067, "trainId": 407},
496 |
{"name": "kettle, boiler", "id": 1367, "trainId": 408},
497 |
{"name": "bidet", "id": 188, "trainId": 409},
498 |
499 |
"name": "baby buggy, baby carriage, carriage, perambulator, pram, stroller, go-cart, pushchair, pusher",
500 |
"id": 79,
501 |
"trainId": 410,
502 |
503 |
{"name": "music stand", "id": 1633, "trainId": 411},
504 |
{"name": "pipe, tube", "id": 1885, "trainId": 412},
505 |
{"name": "cup", "id": 677, "trainId": 413},
506 |
{"name": "parking meter", "id": 1779, "trainId": 414},
507 |
{"name": "ice hockey rink", "id": 1297, "trainId": 415},
508 |
{"name": "shelter", "id": 2334, "trainId": 416},
509 |
{"name": "weeds", "id": 3027, "trainId": 417},
510 |
{"name": "temple", "id": 2735, "trainId": 418},
511 |
{"name": "patty, cake", "id": 1791, "trainId": 419},
512 |
{"name": "ski slope", "id": 2405, "trainId": 420},
513 |
{"name": "panel", "id": 1748, "trainId": 421},
514 |
{"name": "wallet", "id": 2983, "trainId": 422},
515 |
{"name": "wheel", "id": 3035, "trainId": 423},
516 |
{"name": "towel rack, towel horse", "id": 2824, "trainId": 424},
517 |
{"name": "roundabout", "id": 2168, "trainId": 425},
518 |
{"name": "canister, cannister, tin", "id": 385, "trainId": 426},
519 |
{"name": "rod", "id": 2148, "trainId": 427},
520 |
{"name": "soap dispenser", "id": 2465, "trainId": 428},
521 |
{"name": "bell", "id": 175, "trainId": 429},
522 |
{"name": "canvas", "id": 390, "trainId": 430},
523 |
{"name": "box office, ticket office, ticket booth", "id": 268, "trainId": 431},
524 |
{"name": "teacup", "id": 2722, "trainId": 432},
525 |
{"name": "trellis", "id": 2857, "trainId": 433},
526 |
{"name": "workbench", "id": 3088, "trainId": 434},
527 |
{"name": "valley, vale", "id": 2926, "trainId": 435},
528 |
{"name": "toaster", "id": 2782, "trainId": 436},
529 |
{"name": "knife", "id": 1378, "trainId": 437},
530 |
{"name": "podium", "id": 1934, "trainId": 438},
531 |
{"name": "ramp", "id": 2072, "trainId": 439},
532 |
{"name": "tumble dryer", "id": 2889, "trainId": 440},
533 |
{"name": "fireplug, fire hydrant, plug", "id": 944, "trainId": 441},
534 |
{"name": "gym shoe, sneaker, tennis shoe", "id": 1158, "trainId": 442},
535 |
{"name": "lab bench", "id": 1383, "trainId": 443},
536 |
{"name": "equipment", "id": 867, "trainId": 444},
537 |
{"name": "rocky formation", "id": 2145, "trainId": 445},
538 |
{"name": "plastic", "id": 1915, "trainId": 446},
539 |
{"name": "calendar", "id": 361, "trainId": 447},
540 |
{"name": "caravan", "id": 402, "trainId": 448},
541 |
{"name": "check-in-desk", "id": 482, "trainId": 449},
542 |
{"name": "ticket counter", "id": 2761, "trainId": 450},
543 |
{"name": "brush", "id": 300, "trainId": 451},
544 |
{"name": "mill", "id": 1554, "trainId": 452},
545 |
{"name": "covered bridge", "id": 636, "trainId": 453},
546 |
{"name": "bowling alley", "id": 260, "trainId": 454},
547 |
{"name": "hanger", "id": 1186, "trainId": 455},
548 |
{"name": "excavator", "id": 871, "trainId": 456},
549 |
{"name": "trestle", "id": 2859, "trainId": 457},
550 |
{"name": "revolving door", "id": 2103, "trainId": 458},
551 |
{"name": "blast furnace", "id": 208, "trainId": 459},
552 |
{"name": "scale, weighing machine", "id": 2236, "trainId": 460},
553 |
{"name": "projector", "id": 2012, "trainId": 461},
554 |
{"name": "soap", "id": 2462, "trainId": 462},
555 |
{"name": "locker", "id": 1462, "trainId": 463},
556 |
{"name": "tractor", "id": 2832, "trainId": 464},
557 |
{"name": "stretcher", "id": 2617, "trainId": 465},
558 |
{"name": "frame", "id": 1024, "trainId": 466},
559 |
{"name": "grating", "id": 1129, "trainId": 467},
560 |
{"name": "alembic", "id": 18, "trainId": 468},
561 |
{"name": "candle, taper, wax light", "id": 376, "trainId": 469},
562 |
{"name": "barrier", "id": 134, "trainId": 470},
563 |
{"name": "cardboard", "id": 407, "trainId": 471},
564 |
{"name": "cave", "id": 434, "trainId": 472},
565 |
{"name": "puddle", "id": 2017, "trainId": 473},
566 |
{"name": "tarp", "id": 2717, "trainId": 474},
567 |
{"name": "price tag", "id": 2005, "trainId": 475},
568 |
{"name": "watchtower", "id": 2993, "trainId": 476},
569 |
{"name": "meters", "id": 1545, "trainId": 477},
570 |
571 |
"name": "light bulb, lightbulb, bulb, incandescent lamp, electric light, electric-light bulb",
572 |
"id": 1445,
573 |
"trainId": 478,
574 |
575 |
{"name": "tracks", "id": 2831, "trainId": 479},
576 |
{"name": "hair dryer", "id": 1161, "trainId": 480},
577 |
{"name": "skirt", "id": 2411, "trainId": 481},
578 |
{"name": "viaduct", "id": 2949, "trainId": 482},
579 |
{"name": "paper towel", "id": 1769, "trainId": 483},
580 |
{"name": "coat", "id": 552, "trainId": 484},
581 |
{"name": "sheet", "id": 2327, "trainId": 485},
582 |
{"name": "fire extinguisher, extinguisher, asphyxiator", "id": 939, "trainId": 486},
583 |
{"name": "water wheel", "id": 3013, "trainId": 487},
584 |
{"name": "pottery, clayware", "id": 1986, "trainId": 488},
585 |
{"name": "magazine rack", "id": 1486, "trainId": 489},
586 |
{"name": "teapot", "id": 2723, "trainId": 490},
587 |
{"name": "microphone, mike", "id": 1549, "trainId": 491},
588 |
{"name": "support", "id": 2649, "trainId": 492},
589 |
{"name": "forklift", "id": 1020, "trainId": 493},
590 |
{"name": "canyon", "id": 392, "trainId": 494},
591 |
{"name": "cash register, register", "id": 422, "trainId": 495},
592 |
{"name": "leaf, leafage, foliage", "id": 1419, "trainId": 496},
593 |
{"name": "remote control, remote", "id": 2099, "trainId": 497},
594 |
{"name": "soap dish", "id": 2464, "trainId": 498},
595 |
{"name": "windshield, windscreen", "id": 3058, "trainId": 499},
596 |
{"name": "cat", "id": 430, "trainId": 500},
597 |
{"name": "cue, cue stick, pool cue, pool stick", "id": 675, "trainId": 501},
598 |
{"name": "vent, venthole, vent-hole, blowhole", "id": 2941, "trainId": 502},
599 |
{"name": "videos", "id": 2955, "trainId": 503},
600 |
{"name": "shovel", "id": 2355, "trainId": 504},
601 |
{"name": "eaves", "id": 840, "trainId": 505},
602 |
{"name": "antenna, aerial, transmitting aerial", "id": 32, "trainId": 506},
603 |
{"name": "shipyard", "id": 2338, "trainId": 507},
604 |
{"name": "hen, biddy", "id": 1232, "trainId": 508},
605 |
{"name": "traffic cone", "id": 2834, "trainId": 509},
606 |
{"name": "washing machines", "id": 2991, "trainId": 510},
607 |
{"name": "truck crane", "id": 2879, "trainId": 511},
608 |
{"name": "cds", "id": 444, "trainId": 512},
609 |
{"name": "niche", "id": 1657, "trainId": 513},
610 |
{"name": "scoreboard", "id": 2246, "trainId": 514},
611 |
{"name": "briefcase", "id": 296, "trainId": 515},
612 |
{"name": "boot", "id": 245, "trainId": 516},
613 |
{"name": "sweater, jumper", "id": 2661, "trainId": 517},
614 |
{"name": "hay", "id": 1202, "trainId": 518},
615 |
{"name": "pack", "id": 1714, "trainId": 519},
616 |
{"name": "bottle rack", "id": 251, "trainId": 520},
617 |
{"name": "glacier", "id": 1095, "trainId": 521},
618 |
{"name": "pergola", "id": 1828, "trainId": 522},
619 |
{"name": "building materials", "id": 311, "trainId": 523},
620 |
{"name": "television camera", "id": 2732, "trainId": 524},
621 |
{"name": "first floor", "id": 947, "trainId": 525},
622 |
{"name": "rifle", "id": 2115, "trainId": 526},
623 |
{"name": "tennis table", "id": 2738, "trainId": 527},
624 |
{"name": "stadium", "id": 2525, "trainId": 528},
625 |
{"name": "safety belt", "id": 2194, "trainId": 529},
626 |
{"name": "cover", "id": 634, "trainId": 530},
627 |
{"name": "dish rack", "id": 740, "trainId": 531},
628 |
{"name": "synthesizer", "id": 2682, "trainId": 532},
629 |
{"name": "pumpkin", "id": 2020, "trainId": 533},
630 |
{"name": "gutter", "id": 1156, "trainId": 534},
631 |
{"name": "fruit stand", "id": 1036, "trainId": 535},
632 |
{"name": "ice floe, floe", "id": 1295, "trainId": 536},
633 |
{"name": "handle, grip, handgrip, hold", "id": 1181, "trainId": 537},
634 |
{"name": "wheelchair", "id": 3037, "trainId": 538},
635 |
{"name": "mousepad, mouse mat", "id": 1614, "trainId": 539},
636 |
{"name": "diploma", "id": 736, "trainId": 540},
637 |
{"name": "fairground ride", "id": 893, "trainId": 541},
638 |
{"name": "radio", "id": 2047, "trainId": 542},
639 |
{"name": "hotplate", "id": 1274, "trainId": 543},
640 |
{"name": "junk", "id": 1361, "trainId": 544},
641 |
{"name": "wheelbarrow", "id": 3036, "trainId": 545},
642 |
{"name": "stream", "id": 2606, "trainId": 546},
643 |
{"name": "toll plaza", "id": 2797, "trainId": 547},
644 |
{"name": "punching bag", "id": 2022, "trainId": 548},
645 |
{"name": "trough", "id": 2876, "trainId": 549},
646 |
{"name": "throne", "id": 2758, "trainId": 550},
647 |
{"name": "chair desk", "id": 472, "trainId": 551},
648 |
{"name": "weighbridge", "id": 3028, "trainId": 552},
649 |
{"name": "extractor fan", "id": 882, "trainId": 553},
650 |
{"name": "hanging clothes", "id": 1189, "trainId": 554},
651 |
{"name": "dish, dish aerial, dish antenna, saucer", "id": 743, "trainId": 555},
652 |
{"name": "alarm clock, alarm", "id": 3122, "trainId": 556},
653 |
{"name": "ski lift", "id": 2401, "trainId": 557},
654 |
{"name": "chain", "id": 468, "trainId": 558},
655 |
{"name": "garage", "id": 1061, "trainId": 559},
656 |
{"name": "mechanical shovel", "id": 1523, "trainId": 560},
657 |
{"name": "wine rack", "id": 3059, "trainId": 561},
658 |
{"name": "tramway", "id": 2843, "trainId": 562},
659 |
{"name": "treadmill", "id": 2853, "trainId": 563},
660 |
{"name": "menu", "id": 1529, "trainId": 564},
661 |
{"name": "block", "id": 214, "trainId": 565},
662 |
{"name": "well", "id": 3032, "trainId": 566},
663 |
{"name": "witness stand", "id": 3071, "trainId": 567},
664 |
{"name": "branch", "id": 277, "trainId": 568},
665 |
{"name": "duck", "id": 813, "trainId": 569},
666 |
{"name": "casserole", "id": 426, "trainId": 570},
667 |
{"name": "frying pan", "id": 1039, "trainId": 571},
668 |
{"name": "desk organizer", "id": 727, "trainId": 572},
669 |
{"name": "mast", "id": 1508, "trainId": 573},
670 |
{"name": "spectacles, specs, eyeglasses, glasses", "id": 2490, "trainId": 574},
671 |
{"name": "service elevator", "id": 2299, "trainId": 575},
672 |
{"name": "dollhouse", "id": 768, "trainId": 576},
673 |
{"name": "hammock", "id": 1172, "trainId": 577},
674 |
{"name": "clothes hanging", "id": 537, "trainId": 578},
675 |
{"name": "photocopier", "id": 1847, "trainId": 579},
676 |
{"name": "notepad", "id": 1664, "trainId": 580},
677 |
{"name": "golf cart", "id": 1110, "trainId": 581},
678 |
{"name": "footpath", "id": 1014, "trainId": 582},
679 |
{"name": "cross", "id": 662, "trainId": 583},
680 |
{"name": "baptismal font", "id": 121, "trainId": 584},
681 |
{"name": "boiler", "id": 227, "trainId": 585},
682 |
{"name": "skip", "id": 2410, "trainId": 586},
683 |
{"name": "rotisserie", "id": 2165, "trainId": 587},
684 |
{"name": "tables", "id": 2696, "trainId": 588},
685 |
{"name": "water mill", "id": 3005, "trainId": 589},
686 |
{"name": "helmet", "id": 1231, "trainId": 590},
687 |
{"name": "cover curtain", "id": 635, "trainId": 591},
688 |
{"name": "brick", "id": 292, "trainId": 592},
689 |
{"name": "table runner", "id": 2690, "trainId": 593},
690 |
{"name": "ashtray", "id": 65, "trainId": 594},
691 |
{"name": "street box", "id": 2607, "trainId": 595},
692 |
{"name": "stick", "id": 2574, "trainId": 596},
693 |
{"name": "hangers", "id": 1188, "trainId": 597},
694 |
{"name": "cells", "id": 456, "trainId": 598},
695 |
{"name": "urinal", "id": 2913, "trainId": 599},
696 |
{"name": "centerpiece", "id": 459, "trainId": 600},
697 |
{"name": "portable fridge", "id": 1955, "trainId": 601},
698 |
{"name": "dvds", "id": 827, "trainId": 602},
699 |
{"name": "golf club", "id": 1111, "trainId": 603},
700 |
{"name": "skirting board", "id": 2412, "trainId": 604},
701 |
{"name": "water cooler", "id": 2997, "trainId": 605},
702 |
{"name": "clipboard", "id": 528, "trainId": 606},
703 |
{"name": "camera, photographic camera", "id": 366, "trainId": 607},
704 |
{"name": "pigeonhole", "id": 1863, "trainId": 608},
705 |
{"name": "chips", "id": 500, "trainId": 609},
706 |
{"name": "food processor", "id": 1001, "trainId": 610},
707 |
{"name": "post box", "id": 1958, "trainId": 611},
708 |
{"name": "lid", "id": 1441, "trainId": 612},
709 |
{"name": "drum", "id": 809, "trainId": 613},
710 |
{"name": "blender", "id": 210, "trainId": 614},
711 |
{"name": "cave entrance", "id": 435, "trainId": 615},
712 |
{"name": "dental chair", "id": 718, "trainId": 616},
713 |
{"name": "obelisk", "id": 1674, "trainId": 617},
714 |
{"name": "canoe", "id": 388, "trainId": 618},
715 |
{"name": "mobile", "id": 1572, "trainId": 619},
716 |
{"name": "monitors", "id": 1584, "trainId": 620},
717 |
{"name": "pool ball", "id": 1944, "trainId": 621},
718 |
{"name": "cue rack", "id": 674, "trainId": 622},
719 |
{"name": "baggage carts", "id": 99, "trainId": 623},
720 |
{"name": "shore", "id": 2352, "trainId": 624},
721 |
{"name": "fork", "id": 1019, "trainId": 625},
722 |
{"name": "paper filer", "id": 1763, "trainId": 626},
723 |
{"name": "bicycle rack", "id": 185, "trainId": 627},
724 |
{"name": "coat rack", "id": 554, "trainId": 628},
725 |
{"name": "garland", "id": 1066, "trainId": 629},
726 |
{"name": "sports bag", "id": 2508, "trainId": 630},
727 |
{"name": "fish tank", "id": 951, "trainId": 631},
728 |
{"name": "towel dispenser", "id": 2822, "trainId": 632},
729 |
{"name": "carriage", "id": 415, "trainId": 633},
730 |
{"name": "brochure", "id": 297, "trainId": 634},
731 |
{"name": "plaque", "id": 1914, "trainId": 635},
732 |
{"name": "stringer", "id": 2619, "trainId": 636},
733 |
{"name": "iron", "id": 1338, "trainId": 637},
734 |
{"name": "spoon", "id": 2505, "trainId": 638},
735 |
{"name": "flag pole", "id": 955, "trainId": 639},
736 |
{"name": "toilet brush", "id": 2786, "trainId": 640},
737 |
{"name": "book stand", "id": 238, "trainId": 641},
738 |
{"name": "water faucet, water tap, tap, hydrant", "id": 3000, "trainId": 642},
739 |
{"name": "ticket office", "id": 2763, "trainId": 643},
740 |
{"name": "broom", "id": 299, "trainId": 644},
741 |
{"name": "dvd", "id": 822, "trainId": 645},
742 |
{"name": "ice bucket", "id": 1288, "trainId": 646},
743 |
{"name": "carapace, shell, cuticle, shield", "id": 3101, "trainId": 647},
744 |
{"name": "tureen", "id": 2894, "trainId": 648},
745 |
{"name": "folders", "id": 992, "trainId": 649},
746 |
{"name": "chess", "id": 489, "trainId": 650},
747 |
{"name": "root", "id": 2157, "trainId": 651},
748 |
{"name": "sewing machine", "id": 2309, "trainId": 652},
749 |
{"name": "model", "id": 1576, "trainId": 653},
750 |
{"name": "pen", "id": 1810, "trainId": 654},
751 |
{"name": "violin", "id": 2964, "trainId": 655},
752 |
{"name": "sweatshirt", "id": 2662, "trainId": 656},
753 |
{"name": "recycling materials", "id": 2087, "trainId": 657},
754 |
{"name": "mitten", "id": 1569, "trainId": 658},
755 |
{"name": "chopping board, cutting board", "id": 503, "trainId": 659},
756 |
{"name": "mask", "id": 1505, "trainId": 660},
757 |
{"name": "log", "id": 1468, "trainId": 661},
758 |
{"name": "mouse, computer mouse", "id": 1613, "trainId": 662},
759 |
{"name": "grill", "id": 1138, "trainId": 663},
760 |
{"name": "hole", "id": 1256, "trainId": 664},
761 |
{"name": "target", "id": 2715, "trainId": 665},
762 |
{"name": "trash bag", "id": 2846, "trainId": 666},
763 |
{"name": "chalk", "id": 477, "trainId": 667},
764 |
{"name": "sticks", "id": 2576, "trainId": 668},
765 |
{"name": "balloon", "id": 108, "trainId": 669},
766 |
{"name": "score", "id": 2245, "trainId": 670},
767 |
{"name": "hair spray", "id": 1162, "trainId": 671},
768 |
{"name": "roll", "id": 2149, "trainId": 672},
769 |
{"name": "runner", "id": 2183, "trainId": 673},
770 |
{"name": "engine", "id": 858, "trainId": 674},
771 |
{"name": "inflatable glove", "id": 1324, "trainId": 675},
772 |
{"name": "games", "id": 1055, "trainId": 676},
773 |
{"name": "pallets", "id": 1741, "trainId": 677},
774 |
{"name": "baskets", "id": 149, "trainId": 678},
775 |
{"name": "coop", "id": 615, "trainId": 679},
776 |
{"name": "dvd player", "id": 825, "trainId": 680},
777 |
{"name": "rocking horse", "id": 2143, "trainId": 681},
778 |
{"name": "buckets", "id": 304, "trainId": 682},
779 |
{"name": "bread rolls", "id": 283, "trainId": 683},
780 |
{"name": "shawl", "id": 2322, "trainId": 684},
781 |
{"name": "watering can", "id": 3017, "trainId": 685},
782 |
{"name": "spotlights", "id": 2510, "trainId": 686},
783 |
{"name": "post-it", "id": 1960, "trainId": 687},
784 |
{"name": "bowls", "id": 265, "trainId": 688},
785 |
{"name": "security camera", "id": 2282, "trainId": 689},
786 |
{"name": "runner cloth", "id": 2184, "trainId": 690},
787 |
{"name": "lock", "id": 1461, "trainId": 691},
788 |
{"name": "alarm, warning device, alarm system", "id": 3113, "trainId": 692},
789 |
{"name": "side", "id": 2372, "trainId": 693},
790 |
{"name": "roulette", "id": 2166, "trainId": 694},
791 |
{"name": "bone", "id": 232, "trainId": 695},
792 |
{"name": "cutlery", "id": 693, "trainId": 696},
793 |
{"name": "pool balls", "id": 1945, "trainId": 697},
794 |
{"name": "wheels", "id": 3039, "trainId": 698},
795 |
{"name": "spice rack", "id": 2494, "trainId": 699},
796 |
{"name": "plant pots", "id": 1908, "trainId": 700},
797 |
{"name": "towel ring", "id": 2827, "trainId": 701},
798 |
{"name": "bread box", "id": 280, "trainId": 702},
799 |
{"name": "video", "id": 2950, "trainId": 703},
800 |
{"name": "funfair", "id": 1044, "trainId": 704},
801 |
{"name": "breads", "id": 288, "trainId": 705},
802 |
{"name": "tripod", "id": 2863, "trainId": 706},
803 |
{"name": "ironing board", "id": 1342, "trainId": 707},
804 |
{"name": "skimmer", "id": 2409, "trainId": 708},
805 |
{"name": "hollow", "id": 1258, "trainId": 709},
806 |
{"name": "scratching post", "id": 2249, "trainId": 710},
807 |
{"name": "tricycle", "id": 2862, "trainId": 711},
808 |
{"name": "file box", "id": 920, "trainId": 712},
809 |
{"name": "mountain pass", "id": 1607, "trainId": 713},
810 |
{"name": "tombstones", "id": 2802, "trainId": 714},
811 |
{"name": "cooker", "id": 610, "trainId": 715},
812 |
{"name": "card game, cards", "id": 3129, "trainId": 716},
813 |
{"name": "golf bag", "id": 1108, "trainId": 717},
814 |
{"name": "towel paper", "id": 2823, "trainId": 718},
815 |
{"name": "chaise lounge", "id": 476, "trainId": 719},
816 |
{"name": "sun", "id": 2641, "trainId": 720},
817 |
{"name": "toilet paper holder", "id": 2788, "trainId": 721},
818 |
{"name": "rake", "id": 2070, "trainId": 722},
819 |
{"name": "key", "id": 1368, "trainId": 723},
820 |
{"name": "umbrella stand", "id": 2903, "trainId": 724},
821 |
{"name": "dartboard", "id": 699, "trainId": 725},
822 |
{"name": "transformer", "id": 2844, "trainId": 726},
823 |
{"name": "fireplace utensils", "id": 942, "trainId": 727},
824 |
{"name": "sweatshirts", "id": 2663, "trainId": 728},
825 |
826 |
"name": "cellular telephone, cellular phone, cellphone, cell, mobile phone",
827 |
"id": 457,
828 |
"trainId": 729,
829 |
830 |
{"name": "tallboy", "id": 2701, "trainId": 730},
831 |
{"name": "stapler", "id": 2540, "trainId": 731},
832 |
{"name": "sauna", "id": 2231, "trainId": 732},
833 |
{"name": "test tube", "id": 2746, "trainId": 733},
834 |
{"name": "palette", "id": 1738, "trainId": 734},
835 |
{"name": "shopping carts", "id": 2350, "trainId": 735},
836 |
{"name": "tools", "id": 2808, "trainId": 736},
837 |
{"name": "push button, push, button", "id": 2025, "trainId": 737},
838 |
{"name": "star", "id": 2541, "trainId": 738},
839 |
{"name": "roof rack", "id": 2156, "trainId": 739},
840 |
{"name": "barbed wire", "id": 126, "trainId": 740},
841 |
{"name": "spray", "id": 2512, "trainId": 741},
842 |
{"name": "ear", "id": 831, "trainId": 742},
843 |
{"name": "sponge", "id": 2503, "trainId": 743},
844 |
{"name": "racket", "id": 2039, "trainId": 744},
845 |
{"name": "tins", "id": 2774, "trainId": 745},
846 |
{"name": "eyeglasses", "id": 886, "trainId": 746},
847 |
{"name": "file", "id": 919, "trainId": 747},
848 |
{"name": "scarfs", "id": 2240, "trainId": 748},
849 |
{"name": "sugar bowl", "id": 2636, "trainId": 749},
850 |
{"name": "flip flop", "id": 963, "trainId": 750},
851 |
{"name": "headstones", "id": 1218, "trainId": 751},
852 |
{"name": "laptop bag", "id": 1406, "trainId": 752},
853 |
{"name": "leash", "id": 1420, "trainId": 753},
854 |
{"name": "climbing frame", "id": 526, "trainId": 754},
855 |
{"name": "suit hanger", "id": 2639, "trainId": 755},
856 |
{"name": "floor spotlight", "id": 975, "trainId": 756},
857 |
{"name": "plate rack", "id": 1921, "trainId": 757},
858 |
{"name": "sewer", "id": 2305, "trainId": 758},
859 |
{"name": "hard drive", "id": 1193, "trainId": 759},
860 |
{"name": "sprinkler", "id": 2517, "trainId": 760},
861 |
{"name": "tools box", "id": 2809, "trainId": 761},
862 |
{"name": "necklace", "id": 1647, "trainId": 762},
863 |
{"name": "bulbs", "id": 314, "trainId": 763},
864 |
{"name": "steel industry", "id": 2560, "trainId": 764},
865 |
{"name": "club", "id": 545, "trainId": 765},
866 |
{"name": "jack", "id": 1345, "trainId": 766},
867 |
{"name": "door bars", "id": 775, "trainId": 767},
868 |
869 |
"name": "control panel, instrument panel, control board, board, panel",
870 |
"id": 603,
871 |
"trainId": 768,
872 |
873 |
{"name": "hairbrush", "id": 1163, "trainId": 769},
874 |
{"name": "napkin holder", "id": 1641, "trainId": 770},
875 |
{"name": "office", "id": 1678, "trainId": 771},
876 |
{"name": "smoke detector", "id": 2450, "trainId": 772},
877 |
{"name": "utensils", "id": 2915, "trainId": 773},
878 |
{"name": "apron", "id": 42, "trainId": 774},
879 |
{"name": "scissors", "id": 2242, "trainId": 775},
880 |
{"name": "terminal", "id": 2741, "trainId": 776},
881 |
{"name": "grinder", "id": 1143, "trainId": 777},
882 |
{"name": "entry phone", "id": 862, "trainId": 778},
883 |
{"name": "newspaper stand", "id": 1654, "trainId": 779},
884 |
{"name": "pepper shaker", "id": 1826, "trainId": 780},
885 |
{"name": "onions", "id": 1689, "trainId": 781},
886 |
887 |
"name": "central processing unit, cpu, c p u , central processor, processor, mainframe",
888 |
"id": 3124,
889 |
"trainId": 782,
890 |
891 |
{"name": "tape", "id": 2710, "trainId": 783},
892 |
{"name": "bat", "id": 152, "trainId": 784},
893 |
{"name": "coaster", "id": 549, "trainId": 785},
894 |
{"name": "calculator", "id": 360, "trainId": 786},
895 |
{"name": "potatoes", "id": 1982, "trainId": 787},
896 |
{"name": "luggage rack", "id": 1478, "trainId": 788},
897 |
{"name": "salt", "id": 2203, "trainId": 789},
898 |
{"name": "street number", "id": 2612, "trainId": 790},
899 |
{"name": "viewpoint", "id": 2956, "trainId": 791},
900 |
{"name": "sword", "id": 2681, "trainId": 792},
901 |
{"name": "cd", "id": 437, "trainId": 793},
902 |
{"name": "rowing machine", "id": 2171, "trainId": 794},
903 |
{"name": "plug", "id": 1933, "trainId": 795},
904 |
{"name": "andiron, firedog, dog, dog-iron", "id": 3110, "trainId": 796},
905 |
{"name": "pepper", "id": 1824, "trainId": 797},
906 |
{"name": "tongs", "id": 2803, "trainId": 798},
907 |
{"name": "bonfire", "id": 234, "trainId": 799},
908 |
{"name": "dog dish", "id": 764, "trainId": 800},
909 |
{"name": "belt", "id": 177, "trainId": 801},
910 |
{"name": "dumbbells", "id": 817, "trainId": 802},
911 |
{"name": "videocassette recorder, vcr", "id": 3145, "trainId": 803},
912 |
{"name": "hook", "id": 1262, "trainId": 804},
913 |
{"name": "envelopes", "id": 864, "trainId": 805},
914 |
{"name": "shower faucet", "id": 2359, "trainId": 806},
915 |
{"name": "watch", "id": 2992, "trainId": 807},
916 |
{"name": "padlock", "id": 1725, "trainId": 808},
917 |
{"name": "swimming pool ladder", "id": 2667, "trainId": 809},
918 |
{"name": "spanners", "id": 2484, "trainId": 810},
919 |
{"name": "gravy boat", "id": 1133, "trainId": 811},
920 |
{"name": "notice board", "id": 1667, "trainId": 812},
921 |
{"name": "trash bags", "id": 2847, "trainId": 813},
922 |
{"name": "fire alarm", "id": 932, "trainId": 814},
923 |
{"name": "ladle", "id": 1392, "trainId": 815},
924 |
{"name": "stethoscope", "id": 2573, "trainId": 816},
925 |
{"name": "rocket", "id": 2140, "trainId": 817},
926 |
{"name": "funnel", "id": 1046, "trainId": 818},
927 |
{"name": "bowling pins", "id": 264, "trainId": 819},
928 |
{"name": "valve", "id": 2927, "trainId": 820},
929 |
{"name": "thermometer", "id": 2752, "trainId": 821},
930 |
{"name": "cups", "id": 679, "trainId": 822},
931 |
{"name": "spice jar", "id": 2493, "trainId": 823},
932 |
{"name": "night light", "id": 1658, "trainId": 824},
933 |
{"name": "soaps", "id": 2466, "trainId": 825},
934 |
{"name": "games table", "id": 1057, "trainId": 826},
935 |
{"name": "slotted spoon", "id": 2444, "trainId": 827},
936 |
{"name": "reel", "id": 2093, "trainId": 828},
937 |
{"name": "scourer", "id": 2248, "trainId": 829},
938 |
{"name": "sleeping robe", "id": 2432, "trainId": 830},
939 |
{"name": "desk mat", "id": 726, "trainId": 831},
940 |
{"name": "dumbbell", "id": 816, "trainId": 832},
941 |
{"name": "hammer", "id": 1171, "trainId": 833},
942 |
{"name": "tie", "id": 2766, "trainId": 834},
943 |
{"name": "typewriter", "id": 2900, "trainId": 835},
944 |
{"name": "shaker", "id": 2313, "trainId": 836},
945 |
{"name": "cheese dish", "id": 488, "trainId": 837},
946 |
{"name": "sea star", "id": 2265, "trainId": 838},
947 |
{"name": "racquet", "id": 2043, "trainId": 839},
948 |
{"name": "butane gas cylinder", "id": 332, "trainId": 840},
949 |
{"name": "paper weight", "id": 1771, "trainId": 841},
950 |
{"name": "shaving brush", "id": 2320, "trainId": 842},
951 |
{"name": "sunglasses", "id": 2646, "trainId": 843},
952 |
{"name": "gear shift", "id": 1089, "trainId": 844},
953 |
{"name": "towel rail", "id": 2826, "trainId": 845},
954 |
{"name": "adding machine, totalizer, totaliser", "id": 3148, "trainId": 846},
955 |
956 |
957 |
958 |
def _get_ade20k_full_meta():
    """Build the metadata shared by the ADE20K-Full semantic segmentation splits.

    Returns:
        dict: two entries derived from ``ADE20K_SEM_SEG_FULL_CATEGORIES``:

        * ``"stuff_dataset_id_to_contiguous_id"`` -- maps each category's
          ``"id"`` to its position in the category table.
        * ``"stuff_classes"`` -- the category ``"name"`` strings, in table
          order.

    Raises:
        AssertionError: if the category table does not contain exactly 847
            entries.
    """
    stuff_ids = [k["id"] for k in ADE20K_SEM_SEG_FULL_CATEGORIES]
    # Guard against an accidentally truncated or extended category table.
    assert len(stuff_ids) == 847, len(stuff_ids)

    # Contiguous ids are simply the positions of the categories in the table.
    stuff_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(stuff_ids)}
    stuff_classes = [k["name"] for k in ADE20K_SEM_SEG_FULL_CATEGORIES]

    ret = {
        "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id,
        "stuff_classes": stuff_classes,
    }
    return ret
970 |
971 |
972 |
def register_all_ade20k_full(root):
    """Register the ADE20K-Full semantic segmentation validation split.

    The split is registered under the name ``ade20k_full_sem_seg_val``; its
    images and ground-truth files are looked up below
    ``<root>/ADE20K_2021_17_01/``.

    Args:
        root: directory that contains the ``ADE20K_2021_17_01`` folder.
    """
    meta = _get_ade20k_full_meta()
    for name, dirname in [("val", "validation")]:
        image_dir = os.path.join(root, "ADE20K_2021_17_01/images_detectron2", dirname)
        gt_dir = os.path.join(root, "ADE20K_2021_17_01/annotations_detectron2", dirname)
        name = f"ade20k_full_sem_seg_{name}"
        # The directories are bound as lambda defaults so the loader keeps the
        # paths of this iteration rather than whatever the loop ends on.
        DatasetCatalog.register(
            name,
            lambda x=image_dir, y=gt_dir: load_sem_seg(
                y, x, gt_ext="tif", image_ext="jpg"
            ),
        )
        # NOTE(review): image_root / sem_seg_root / evaluator_type follow the
        # usual detectron2 layout for semantic segmentation datasets -- confirm
        # against the upstream registration code.
        MetadataCatalog.get(name).set(
            stuff_classes=meta["stuff_classes"][:],
            thing_classes=meta["stuff_classes"][:],  # the same as stuff_classes
            image_root=image_dir,
            sem_seg_root=gt_dir,
            evaluator_type="sem_seg",
            ignore_label=65535,  # NOTE: gt is saved in 16-bit TIFF images
        )
991 |
992 |
993 |
994 |
# Dataset root directory: read from the DETECTRON2_DATASETS environment
# variable, falling back to "datasets" when the variable is not set.
_root = os.getenv("DETECTRON2_DATASETS", "datasets")
995 |
@@ -0,0 +1,457 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
import os
3 |
4 |
import pandas as pd
5 |
from import DatasetCatalog, MetadataCatalog
6 |
from import load_sem_seg
7 |
from detectron2.utils.file_io import PathManager
8 |
9 |
10 |
11 |
{"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"},
12 |
{"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"},
13 |
{"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"},
14 |
{"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"},
15 |
{"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"},
16 |
{"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"},
17 |
{"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"},
18 |
{"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"},
19 |
{"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"},
20 |
{"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"},
21 |
{"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"},
22 |
{"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"},
23 |
{"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"},
24 |
{"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"},
25 |
{"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"},
26 |
{"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"},
27 |
{"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"},
28 |
{"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"},
29 |
{"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"},
30 |
{"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"},
31 |
{"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"},
32 |
{"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"},
33 |
{"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"},
34 |
{"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"},
35 |
{"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"},
36 |
{"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"},
37 |
{"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"},
38 |
{"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"},
39 |
{"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"},
40 |
{"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"},
41 |
{"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"},
42 |
{"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"},
43 |
{"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"},
44 |
{"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"},
45 |
{"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"},
46 |
{"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"},
47 |
{"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"},
48 |
{"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"},
49 |
{"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"},
50 |
{"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"},
51 |
{"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"},
52 |
{"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"},
53 |
{"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"},
54 |
{"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"},
55 |
{"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"},
56 |
{"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"},
57 |
{"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"},
58 |
{"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"},
59 |
{"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"},
60 |
{"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"},
61 |
{"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"},
62 |
{"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"},
63 |
{"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"},
64 |
{"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"},
65 |
{"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"},
66 |
{"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"},
67 |
{"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"},
68 |
{"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"},
69 |
{"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"},
70 |
{"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"},
71 |
{"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"},
72 |
{"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"},
73 |
{"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"},
74 |
{"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"},
75 |
{"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"},
76 |
{"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"},
77 |
{"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"},
78 |
{"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"},
79 |
{"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"},
80 |
{"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"},
81 |
{"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"},
82 |
{"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"},
83 |
{"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"},
84 |
{"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"},
85 |
{"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"},
86 |
{"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"},
87 |
{"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"},
88 |
{"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"},
89 |
{"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"},
90 |
{"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"},
91 |
{"id": 92, "name": "banner", "supercategory": "textile"},
92 |
{"id": 93, "name": "blanket", "supercategory": "textile"},
93 |
{"id": 94, "name": "branch", "supercategory": "plant"},
94 |
{"id": 95, "name": "bridge", "supercategory": "building"},
95 |
{"id": 96, "name": "building-other", "supercategory": "building"},
96 |
{"id": 97, "name": "bush", "supercategory": "plant"},
97 |
{"id": 98, "name": "cabinet", "supercategory": "furniture-stuff"},
98 |
{"id": 99, "name": "cage", "supercategory": "structural"},
99 |
{"id": 100, "name": "cardboard", "supercategory": "raw-material"},
100 |
{"id": 101, "name": "carpet", "supercategory": "floor"},
101 |
{"id": 102, "name": "ceiling-other", "supercategory": "ceiling"},
102 |
{"id": 103, "name": "ceiling-tile", "supercategory": "ceiling"},
103 |
{"id": 104, "name": "cloth", "supercategory": "textile"},
104 |
{"id": 105, "name": "clothes", "supercategory": "textile"},
105 |
{"id": 106, "name": "clouds", "supercategory": "sky"},
106 |
{"id": 107, "name": "counter", "supercategory": "furniture-stuff"},
107 |
{"id": 108, "name": "cupboard", "supercategory": "furniture-stuff"},
108 |
{"id": 109, "name": "curtain", "supercategory": "textile"},
109 |
{"id": 110, "name": "desk-stuff", "supercategory": "furniture-stuff"},
110 |
{"id": 111, "name": "dirt", "supercategory": "ground"},
111 |
{"id": 112, "name": "door-stuff", "supercategory": "furniture-stuff"},
112 |
{"id": 113, "name": "fence", "supercategory": "structural"},
113 |
{"id": 114, "name": "floor-marble", "supercategory": "floor"},
114 |
{"id": 115, "name": "floor-other", "supercategory": "floor"},
115 |
{"id": 116, "name": "floor-stone", "supercategory": "floor"},
116 |
{"id": 117, "name": "floor-tile", "supercategory": "floor"},
117 |
{"id": 118, "name": "floor-wood", "supercategory": "floor"},
118 |
{"id": 119, "name": "flower", "supercategory": "plant"},
119 |
{"id": 120, "name": "fog", "supercategory": "water"},
120 |
{"id": 121, "name": "food-other", "supercategory": "food-stuff"},
121 |
{"id": 122, "name": "fruit", "supercategory": "food-stuff"},
122 |
{"id": 123, "name": "furniture-other", "supercategory": "furniture-stuff"},
123 |
{"id": 124, "name": "grass", "supercategory": "plant"},
124 |
{"id": 125, "name": "gravel", "supercategory": "ground"},
125 |
{"id": 126, "name": "ground-other", "supercategory": "ground"},
126 |
{"id": 127, "name": "hill", "supercategory": "solid"},
127 |
{"id": 128, "name": "house", "supercategory": "building"},
128 |
{"id": 129, "name": "leaves", "supercategory": "plant"},
129 |
{"id": 130, "name": "light", "supercategory": "furniture-stuff"},
130 |
{"id": 131, "name": "mat", "supercategory": "textile"},
131 |
{"id": 132, "name": "metal", "supercategory": "raw-material"},
132 |
{"id": 133, "name": "mirror-stuff", "supercategory": "furniture-stuff"},
133 |
{"id": 134, "name": "moss", "supercategory": "plant"},
134 |
{"id": 135, "name": "mountain", "supercategory": "solid"},
135 |
{"id": 136, "name": "mud", "supercategory": "ground"},
136 |
{"id": 137, "name": "napkin", "supercategory": "textile"},
137 |
{"id": 138, "name": "net", "supercategory": "structural"},
138 |
{"id": 139, "name": "paper", "supercategory": "raw-material"},
139 |
{"id": 140, "name": "pavement", "supercategory": "ground"},
140 |
{"id": 141, "name": "pillow", "supercategory": "textile"},
141 |
{"id": 142, "name": "plant-other", "supercategory": "plant"},
142 |
{"id": 143, "name": "plastic", "supercategory": "raw-material"},
143 |
{"id": 144, "name": "platform", "supercategory": "ground"},
144 |
{"id": 145, "name": "playingfield", "supercategory": "ground"},
145 |
{"id": 146, "name": "railing", "supercategory": "structural"},
146 |
{"id": 147, "name": "railroad", "supercategory": "ground"},
147 |
{"id": 148, "name": "river", "supercategory": "water"},
148 |
{"id": 149, "name": "road", "supercategory": "ground"},
149 |
{"id": 150, "name": "rock", "supercategory": "solid"},
150 |
{"id": 151, "name": "roof", "supercategory": "building"},
151 |
{"id": 152, "name": "rug", "supercategory": "textile"},
152 |
{"id": 153, "name": "salad", "supercategory": "food-stuff"},
153 |
{"id": 154, "name": "sand", "supercategory": "ground"},
154 |
{"id": 155, "name": "sea", "supercategory": "water"},
155 |
{"id": 156, "name": "shelf", "supercategory": "furniture-stuff"},
156 |
{"id": 157, "name": "sky-other", "supercategory": "sky"},
157 |
{"id": 158, "name": "skyscraper", "supercategory": "building"},
158 |
{"id": 159, "name": "snow", "supercategory": "ground"},
159 |
{"id": 160, "name": "solid-other", "supercategory": "solid"},
160 |
{"id": 161, "name": "stairs", "supercategory": "furniture-stuff"},
161 |
{"id": 162, "name": "stone", "supercategory": "solid"},
162 |
{"id": 163, "name": "straw", "supercategory": "plant"},
163 |
{"id": 164, "name": "structural-other", "supercategory": "structural"},
164 |
{"id": 165, "name": "table", "supercategory": "furniture-stuff"},
165 |
{"id": 166, "name": "tent", "supercategory": "building"},
166 |
{"id": 167, "name": "textile-other", "supercategory": "textile"},
167 |
{"id": 168, "name": "towel", "supercategory": "textile"},
168 |
{"id": 169, "name": "tree", "supercategory": "plant"},
169 |
{"id": 170, "name": "vegetable", "supercategory": "food-stuff"},
170 |
{"id": 171, "name": "wall-brick", "supercategory": "wall"},
171 |
{"id": 172, "name": "wall-concrete", "supercategory": "wall"},
172 |
{"id": 173, "name": "wall-other", "supercategory": "wall"},
173 |
{"id": 174, "name": "wall-panel", "supercategory": "wall"},
174 |
{"id": 175, "name": "wall-stone", "supercategory": "wall"},
175 |
{"id": 176, "name": "wall-tile", "supercategory": "wall"},
176 |
{"id": 177, "name": "wall-wood", "supercategory": "wall"},
177 |
{"id": 178, "name": "water-other", "supercategory": "water"},
178 |
{"id": 179, "name": "waterdrops", "supercategory": "water"},
179 |
{"id": 180, "name": "window-blind", "supercategory": "window"},
180 |
{"id": 181, "name": "window-other", "supercategory": "window"},
181 |
{"id": 182, "name": "wood", "supercategory": "solid"},
182 |
183 |
184 |
185 |
186 |
{"color": [120, 120, 120], "id": 0, "isthing": 0, "name": "wall"},
187 |
{"color": [180, 120, 120], "id": 1, "isthing": 0, "name": "building"},
188 |
{"color": [6, 230, 230], "id": 2, "isthing": 0, "name": "sky"},
189 |
{"color": [80, 50, 50], "id": 3, "isthing": 0, "name": "floor"},
190 |
{"color": [4, 200, 3], "id": 4, "isthing": 0, "name": "tree"},
191 |
{"color": [120, 120, 80], "id": 5, "isthing": 0, "name": "ceiling"},
192 |
{"color": [140, 140, 140], "id": 6, "isthing": 0, "name": "road, route"},
193 |
{"color": [204, 5, 255], "id": 7, "isthing": 1, "name": "bed"},
194 |
{"color": [230, 230, 230], "id": 8, "isthing": 1, "name": "window "},
195 |
{"color": [4, 250, 7], "id": 9, "isthing": 0, "name": "grass"},
196 |
{"color": [224, 5, 255], "id": 10, "isthing": 1, "name": "cabinet"},
197 |
{"color": [235, 255, 7], "id": 11, "isthing": 0, "name": "sidewalk, pavement"},
198 |
{"color": [150, 5, 61], "id": 12, "isthing": 1, "name": "person"},
199 |
{"color": [120, 120, 70], "id": 13, "isthing": 0, "name": "earth, ground"},
200 |
{"color": [8, 255, 51], "id": 14, "isthing": 1, "name": "door"},
201 |
{"color": [255, 6, 82], "id": 15, "isthing": 1, "name": "table"},
202 |
{"color": [143, 255, 140], "id": 16, "isthing": 0, "name": "mountain, mount"},
203 |
{"color": [204, 255, 4], "id": 17, "isthing": 0, "name": "plant"},
204 |
{"color": [255, 51, 7], "id": 18, "isthing": 1, "name": "curtain"},
205 |
{"color": [204, 70, 3], "id": 19, "isthing": 1, "name": "chair"},
206 |
{"color": [0, 102, 200], "id": 20, "isthing": 1, "name": "car"},
207 |
{"color": [61, 230, 250], "id": 21, "isthing": 0, "name": "water"},
208 |
{"color": [255, 6, 51], "id": 22, "isthing": 1, "name": "painting, picture"},
209 |
{"color": [11, 102, 255], "id": 23, "isthing": 1, "name": "sofa"},
210 |
{"color": [255, 7, 71], "id": 24, "isthing": 1, "name": "shelf"},
211 |
{"color": [255, 9, 224], "id": 25, "isthing": 0, "name": "house"},
212 |
{"color": [9, 7, 230], "id": 26, "isthing": 0, "name": "sea"},
213 |
{"color": [220, 220, 220], "id": 27, "isthing": 1, "name": "mirror"},
214 |
{"color": [255, 9, 92], "id": 28, "isthing": 0, "name": "rug"},
215 |
{"color": [112, 9, 255], "id": 29, "isthing": 0, "name": "field"},
216 |
{"color": [8, 255, 214], "id": 30, "isthing": 1, "name": "armchair"},
217 |
{"color": [7, 255, 224], "id": 31, "isthing": 1, "name": "seat"},
218 |
{"color": [255, 184, 6], "id": 32, "isthing": 1, "name": "fence"},
219 |
{"color": [10, 255, 71], "id": 33, "isthing": 1, "name": "desk"},
220 |
{"color": [255, 41, 10], "id": 34, "isthing": 0, "name": "rock, stone"},
221 |
{"color": [7, 255, 255], "id": 35, "isthing": 1, "name": "wardrobe, closet, press"},
222 |
{"color": [224, 255, 8], "id": 36, "isthing": 1, "name": "lamp"},
223 |
{"color": [102, 8, 255], "id": 37, "isthing": 1, "name": "tub"},
224 |
{"color": [255, 61, 6], "id": 38, "isthing": 1, "name": "rail"},
225 |
{"color": [255, 194, 7], "id": 39, "isthing": 1, "name": "cushion"},
226 |
{"color": [255, 122, 8], "id": 40, "isthing": 0, "name": "base, pedestal, stand"},
227 |
{"color": [0, 255, 20], "id": 41, "isthing": 1, "name": "box"},
228 |
{"color": [255, 8, 41], "id": 42, "isthing": 1, "name": "column, pillar"},
229 |
{"color": [255, 5, 153], "id": 43, "isthing": 1, "name": "signboard, sign"},
230 |
231 |
"color": [6, 51, 255],
232 |
"id": 44,
233 |
"isthing": 1,
234 |
"name": "chest of drawers, chest, bureau, dresser",
235 |
236 |
{"color": [235, 12, 255], "id": 45, "isthing": 1, "name": "counter"},
237 |
{"color": [160, 150, 20], "id": 46, "isthing": 0, "name": "sand"},
238 |
{"color": [0, 163, 255], "id": 47, "isthing": 1, "name": "sink"},
239 |
{"color": [140, 140, 140], "id": 48, "isthing": 0, "name": "skyscraper"},
240 |
{"color": [250, 10, 15], "id": 49, "isthing": 1, "name": "fireplace"},
241 |
{"color": [20, 255, 0], "id": 50, "isthing": 1, "name": "refrigerator, icebox"},
242 |
{"color": [31, 255, 0], "id": 51, "isthing": 0, "name": "grandstand, covered stand"},
243 |
{"color": [255, 31, 0], "id": 52, "isthing": 0, "name": "path"},
244 |
{"color": [255, 224, 0], "id": 53, "isthing": 1, "name": "stairs"},
245 |
{"color": [153, 255, 0], "id": 54, "isthing": 0, "name": "runway"},
246 |
{"color": [0, 0, 255], "id": 55, "isthing": 1, "name": "case, display case, showcase, vitrine"},
247 |
248 |
"color": [255, 71, 0],
249 |
"id": 56,
250 |
"isthing": 1,
251 |
"name": "pool table, billiard table, snooker table",
252 |
253 |
{"color": [0, 235, 255], "id": 57, "isthing": 1, "name": "pillow"},
254 |
{"color": [0, 173, 255], "id": 58, "isthing": 1, "name": "screen door, screen"},
255 |
{"color": [31, 0, 255], "id": 59, "isthing": 0, "name": "stairway, staircase"},
256 |
{"color": [11, 200, 200], "id": 60, "isthing": 0, "name": "river"},
257 |
{"color": [255, 82, 0], "id": 61, "isthing": 0, "name": "bridge, span"},
258 |
{"color": [0, 255, 245], "id": 62, "isthing": 1, "name": "bookcase"},
259 |
{"color": [0, 61, 255], "id": 63, "isthing": 0, "name": "blind, screen"},
260 |
{"color": [0, 255, 112], "id": 64, "isthing": 1, "name": "coffee table"},
261 |
262 |
"color": [0, 255, 133],
263 |
"id": 65,
264 |
"isthing": 1,
265 |
"name": "toilet, can, commode, crapper, pot, potty, stool, throne",
266 |
267 |
{"color": [255, 0, 0], "id": 66, "isthing": 1, "name": "flower"},
268 |
{"color": [255, 163, 0], "id": 67, "isthing": 1, "name": "book"},
269 |
{"color": [255, 102, 0], "id": 68, "isthing": 0, "name": "hill"},
270 |
{"color": [194, 255, 0], "id": 69, "isthing": 1, "name": "bench"},
271 |
{"color": [0, 143, 255], "id": 70, "isthing": 1, "name": "countertop"},
272 |
{"color": [51, 255, 0], "id": 71, "isthing": 1, "name": "stove"},
273 |
{"color": [0, 82, 255], "id": 72, "isthing": 1, "name": "palm, palm tree"},
274 |
{"color": [0, 255, 41], "id": 73, "isthing": 1, "name": "kitchen island"},
275 |
{"color": [0, 255, 173], "id": 74, "isthing": 1, "name": "computer"},
276 |
{"color": [10, 0, 255], "id": 75, "isthing": 1, "name": "swivel chair"},
277 |
{"color": [173, 255, 0], "id": 76, "isthing": 1, "name": "boat"},
278 |
{"color": [0, 255, 153], "id": 77, "isthing": 0, "name": "bar"},
279 |
{"color": [255, 92, 0], "id": 78, "isthing": 1, "name": "arcade machine"},
280 |
{"color": [255, 0, 255], "id": 79, "isthing": 0, "name": "hovel, hut, hutch, shack, shanty"},
281 |
{"color": [255, 0, 245], "id": 80, "isthing": 1, "name": "bus"},
282 |
{"color": [255, 0, 102], "id": 81, "isthing": 1, "name": "towel"},
283 |
{"color": [255, 173, 0], "id": 82, "isthing": 1, "name": "light"},
284 |
{"color": [255, 0, 20], "id": 83, "isthing": 1, "name": "truck"},
285 |
{"color": [255, 184, 184], "id": 84, "isthing": 0, "name": "tower"},
286 |
{"color": [0, 31, 255], "id": 85, "isthing": 1, "name": "chandelier"},
287 |
{"color": [0, 255, 61], "id": 86, "isthing": 1, "name": "awning, sunshade, sunblind"},
288 |
{"color": [0, 71, 255], "id": 87, "isthing": 1, "name": "street lamp"},
289 |
{"color": [255, 0, 204], "id": 88, "isthing": 1, "name": "booth"},
290 |
{"color": [0, 255, 194], "id": 89, "isthing": 1, "name": "tv"},
291 |
{"color": [0, 255, 82], "id": 90, "isthing": 1, "name": "plane"},
292 |
{"color": [0, 10, 255], "id": 91, "isthing": 0, "name": "dirt track"},
293 |
{"color": [0, 112, 255], "id": 92, "isthing": 1, "name": "clothes"},
294 |
{"color": [51, 0, 255], "id": 93, "isthing": 1, "name": "pole"},
295 |
{"color": [0, 194, 255], "id": 94, "isthing": 0, "name": "land, ground, soil"},
296 |
297 |
"color": [0, 122, 255],
298 |
"id": 95,
299 |
"isthing": 1,
300 |
"name": "bannister, banister, balustrade, balusters, handrail",
301 |
302 |
303 |
"color": [0, 255, 163],
304 |
"id": 96,
305 |
"isthing": 0,
306 |
"name": "escalator, moving staircase, moving stairway",
307 |
308 |
309 |
"color": [255, 153, 0],
310 |
"id": 97,
311 |
"isthing": 1,
312 |
"name": "ottoman, pouf, pouffe, puff, hassock",
313 |
314 |
{"color": [0, 255, 10], "id": 98, "isthing": 1, "name": "bottle"},
315 |
{"color": [255, 112, 0], "id": 99, "isthing": 0, "name": "buffet, counter, sideboard"},
316 |
317 |
"color": [143, 255, 0],
318 |
"id": 100,
319 |
"isthing": 0,
320 |
"name": "poster, posting, placard, notice, bill, card",
321 |
322 |
{"color": [82, 0, 255], "id": 101, "isthing": 0, "name": "stage"},
323 |
{"color": [163, 255, 0], "id": 102, "isthing": 1, "name": "van"},
324 |
{"color": [255, 235, 0], "id": 103, "isthing": 1, "name": "ship"},
325 |
{"color": [8, 184, 170], "id": 104, "isthing": 1, "name": "fountain"},
326 |
327 |
"color": [133, 0, 255],
328 |
"id": 105,
329 |
"isthing": 0,
330 |
"name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter",
331 |
332 |
{"color": [0, 255, 92], "id": 106, "isthing": 0, "name": "canopy"},
333 |
334 |
"color": [184, 0, 255],
335 |
"id": 107,
336 |
"isthing": 1,
337 |
"name": "washer, automatic washer, washing machine",
338 |
339 |
{"color": [255, 0, 31], "id": 108, "isthing": 1, "name": "plaything, toy"},
340 |
{"color": [0, 184, 255], "id": 109, "isthing": 0, "name": "pool"},
341 |
{"color": [0, 214, 255], "id": 110, "isthing": 1, "name": "stool"},
342 |
{"color": [255, 0, 112], "id": 111, "isthing": 1, "name": "barrel, cask"},
343 |
{"color": [92, 255, 0], "id": 112, "isthing": 1, "name": "basket, handbasket"},
344 |
{"color": [0, 224, 255], "id": 113, "isthing": 0, "name": "falls"},
345 |
{"color": [112, 224, 255], "id": 114, "isthing": 0, "name": "tent"},
346 |
{"color": [70, 184, 160], "id": 115, "isthing": 1, "name": "bag"},
347 |
{"color": [163, 0, 255], "id": 116, "isthing": 1, "name": "minibike, motorbike"},
348 |
{"color": [153, 0, 255], "id": 117, "isthing": 0, "name": "cradle"},
349 |
{"color": [71, 255, 0], "id": 118, "isthing": 1, "name": "oven"},
350 |
{"color": [255, 0, 163], "id": 119, "isthing": 1, "name": "ball"},
351 |
{"color": [255, 204, 0], "id": 120, "isthing": 1, "name": "food, solid food"},
352 |
{"color": [255, 0, 143], "id": 121, "isthing": 1, "name": "step, stair"},
353 |
{"color": [0, 255, 235], "id": 122, "isthing": 0, "name": "tank, storage tank"},
354 |
{"color": [133, 255, 0], "id": 123, "isthing": 1, "name": "trade name"},
355 |
{"color": [255, 0, 235], "id": 124, "isthing": 1, "name": "microwave"},
356 |
{"color": [245, 0, 255], "id": 125, "isthing": 1, "name": "pot"},
357 |
{"color": [255, 0, 122], "id": 126, "isthing": 1, "name": "animal"},
358 |
{"color": [255, 245, 0], "id": 127, "isthing": 1, "name": "bicycle"},
359 |
{"color": [10, 190, 212], "id": 128, "isthing": 0, "name": "lake"},
360 |
{"color": [214, 255, 0], "id": 129, "isthing": 1, "name": "dishwasher"},
361 |
{"color": [0, 204, 255], "id": 130, "isthing": 1, "name": "screen"},
362 |
{"color": [20, 0, 255], "id": 131, "isthing": 0, "name": "blanket, cover"},
363 |
{"color": [255, 255, 0], "id": 132, "isthing": 1, "name": "sculpture"},
364 |
{"color": [0, 153, 255], "id": 133, "isthing": 1, "name": "hood, exhaust hood"},
365 |
{"color": [0, 41, 255], "id": 134, "isthing": 1, "name": "sconce"},
366 |
{"color": [0, 255, 204], "id": 135, "isthing": 1, "name": "vase"},
367 |
{"color": [41, 0, 255], "id": 136, "isthing": 1, "name": "traffic light"},
368 |
{"color": [41, 255, 0], "id": 137, "isthing": 1, "name": "tray"},
369 |
{"color": [173, 0, 255], "id": 138, "isthing": 1, "name": "trash can"},
370 |
{"color": [0, 245, 255], "id": 139, "isthing": 1, "name": "fan"},
371 |
{"color": [71, 0, 255], "id": 140, "isthing": 0, "name": "pier"},
372 |
{"color": [122, 0, 255], "id": 141, "isthing": 0, "name": "crt screen"},
373 |
{"color": [0, 255, 184], "id": 142, "isthing": 1, "name": "plate"},
374 |
{"color": [0, 92, 255], "id": 143, "isthing": 1, "name": "monitor"},
375 |
{"color": [184, 255, 0], "id": 144, "isthing": 1, "name": "bulletin board"},
376 |
{"color": [0, 133, 255], "id": 145, "isthing": 0, "name": "shower"},
377 |
{"color": [255, 214, 0], "id": 146, "isthing": 1, "name": "radiator"},
378 |
{"color": [25, 194, 194], "id": 147, "isthing": 1, "name": "glass, drinking glass"},
379 |
{"color": [102, 255, 0], "id": 148, "isthing": 1, "name": "clock"},
380 |
{"color": [92, 0, 255], "id": 149, "isthing": 1, "name": "flag"},
381 |
382 |
383 |
384 |
{"color": [143, 255, 140], "id": 16, "isthing": 0, "name": "Oculus"},
385 |
{"color": [204, 255, 4], "id": 17, "isthing": 0, "name": "Ukulele"},
386 |
387 |
388 |
389 |
390 |
for i, c in enumerate(COCO_CATEGORIES)
391 |
if c["id"] - 1
392 |
not in [20, 24, 32, 33, 40, 56, 86, 99, 105, 123, 144, 147, 148, 168, 171]
393 |
394 |
395 |
396 |
for i, c in enumerate(COCO_CATEGORIES)
397 |
if c["id"] - 1
398 |
in [20, 24, 32, 33, 40, 56, 86, 99, 105, 123, 144, 147, 148, 168, 171]
399 |
400 |
401 |
402 |
def load_cc_image(csv_file, img_key='filepath', caption_key='title', sep="\t"):
    """Load image-path/caption pairs from a delimited text file.

    Args:
        csv_file: path (or any source accepted by ``pandas.read_csv``) of the
            table to read.
        img_key: name of the column holding the image file paths.
        caption_key: name of the column holding the captions.
        sep: field delimiter passed to ``pandas.read_csv``.

    Returns:
        list[dict]: one dict per table row, in row order, with the keys
        ``"file_name"`` (value of ``img_key``) and ``"caption"`` (value of
        ``caption_key``).

    Raises:
        KeyError: if ``img_key`` or ``caption_key`` is not a column of the table.
    """
    print(f'Loading csv data from {csv_file}.')
    df = pd.read_csv(csv_file, sep=sep)

    input_files = df[img_key].tolist()
    captions = df[caption_key].tolist()

    print("Loaded {} images".format(len(input_files)))

    # Every row must end up in the returned list; building the records in a
    # comprehension makes it impossible to create one and then drop it.
    dataset_dicts = [
        {"file_name": img_path, "caption": text}
        for img_path, text in zip(input_files, captions)
    ]
    return dataset_dicts
419 |
420 |
421 |
def _get_coco_stuff_meta(cat_list):
422 |
# Id 0 is reserved for ignore_label, we change ignore_label for 0
423 |
# to 255 in our pre-processing.
424 |
stuff_ids = [k["id"] for k in cat_list]
425 |
426 |
# For semantic segmentation, this mapping maps from contiguous stuff id
427 |
# (in [0, 91], used in models) to ids in the dataset (used for processing results)
428 |
stuff_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(stuff_ids)}
429 |
stuff_classes = [k["name"] for k in cat_list]
430 |
431 |
ret = {
432 |
"stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id,
433 |
"stuff_classes": stuff_classes,
434 |
435 |
return ret
436 |
437 |
438 |
def register_cc_3m(csv_file):
# Prepares the pieces for a dataset named "cc_3m_train" backed by `csv_file`.
# NOTE(review): most lines of this function are missing from this listing, so
# what `meta`, `name` and the lambda below are passed to is not visible here
# -- confirm against the complete file.
439 |
440 |
# Metadata is derived from TEST_CATEGORIES.
meta = _get_coco_stuff_meta(TEST_CATEGORIES)
441 |
name = "cc_3m_train"
442 |
443 |
444 |
445 |
# csv_file is bound as a default argument; calling the lambda with no
# arguments runs load_cc_image on that path with its default column names
# and separator.
lambda x=csv_file: load_cc_image(x),
446 |
447 |
448 |
449 |
450 |
451 |
452 |
453 |
454 |
455 |
# Path of the CSV file used by this module (presumably the one handed to
# register_cc_3m -- the consuming line is not visible here; verify).
# NOTE(review): both the commented-out alternative and the active value are
# absolute paths under a single user's home directory, so this looks like it
# must be edited per machine -- confirm.
# _csv_file = "/home/jeffliang/zsseg/datasets/coco/coco_train_merge_captions.csv"
456 |
_csv_file = "/home/jeffliang/zsseg/configs/masked_images/pred/samples.csv"
457 |
@@ -0,0 +1,250 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
import os
3 |
4 |
from import DatasetCatalog, MetadataCatalog
5 |
from import load_sem_seg
6 |
7 |
8 |
9 |
{"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"},
10 |
{"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"},
11 |
{"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"},
12 |
{"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"},
13 |
{"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"},
14 |
{"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"},
15 |
{"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"},
16 |
{"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"},
17 |
{"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"},
18 |
{"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"},
19 |
{"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"},
20 |
{"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"},
21 |
{"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"},
22 |
{"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"},
23 |
{"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"},
24 |
{"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"},
25 |
{"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"},
26 |
{"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"},
27 |
{"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"},
28 |
{"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"},
29 |
{"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"},
30 |
{"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"},
31 |
{"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"},
32 |
{"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"},
33 |
{"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"},
34 |
{"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"},
35 |
{"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"},
36 |
{"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"},
37 |
{"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"},
38 |
{"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"},
39 |
{"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"},
40 |
{"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"},
41 |
{"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"},
42 |
{"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"},
43 |
{"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"},
44 |
{"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"},
45 |
{"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"},
46 |
{"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"},
47 |
{"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"},
48 |
{"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"},
49 |
{"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"},
50 |
{"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"},
51 |
{"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"},
52 |
{"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"},
53 |
{"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"},
54 |
{"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"},
55 |
{"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"},
56 |
{"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"},
57 |
{"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"},
58 |
{"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"},
59 |
{"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"},
60 |
{"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"},
61 |
{"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"},
62 |
{"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"},
63 |
{"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"},
64 |
{"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"},
65 |
{"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"},
66 |
{"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"},
67 |
{"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"},
68 |
{"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"},
69 |
{"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"},
70 |
{"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"},
71 |
{"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"},
72 |
{"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"},
73 |
{"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"},
74 |
{"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"},
75 |
{"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"},
76 |
{"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"},
77 |
{"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"},
78 |
{"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"},
79 |
{"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"},
80 |
{"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"},
81 |
{"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"},
82 |
{"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"},
83 |
{"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"},
84 |
{"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"},
85 |
{"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"},
86 |
{"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"},
87 |
{"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"},
88 |
{"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"},
89 |
{"id": 92, "name": "banner", "supercategory": "textile"},
90 |
{"id": 93, "name": "blanket", "supercategory": "textile"},
91 |
{"id": 94, "name": "branch", "supercategory": "plant"},
92 |
{"id": 95, "name": "bridge", "supercategory": "building"},
93 |
{"id": 96, "name": "building-other", "supercategory": "building"},
94 |
{"id": 97, "name": "bush", "supercategory": "plant"},
95 |
{"id": 98, "name": "cabinet", "supercategory": "furniture-stuff"},
96 |
{"id": 99, "name": "cage", "supercategory": "structural"},
97 |
{"id": 100, "name": "cardboard", "supercategory": "raw-material"},
98 |
{"id": 101, "name": "carpet", "supercategory": "floor"},
99 |
{"id": 102, "name": "ceiling-other", "supercategory": "ceiling"},
100 |
{"id": 103, "name": "ceiling-tile", "supercategory": "ceiling"},
101 |
{"id": 104, "name": "cloth", "supercategory": "textile"},
102 |
{"id": 105, "name": "clothes", "supercategory": "textile"},
103 |
{"id": 106, "name": "clouds", "supercategory": "sky"},
104 |
{"id": 107, "name": "counter", "supercategory": "furniture-stuff"},
105 |
{"id": 108, "name": "cupboard", "supercategory": "furniture-stuff"},
106 |
{"id": 109, "name": "curtain", "supercategory": "textile"},
107 |
{"id": 110, "name": "desk-stuff", "supercategory": "furniture-stuff"},
108 |
{"id": 111, "name": "dirt", "supercategory": "ground"},
109 |
{"id": 112, "name": "door-stuff", "supercategory": "furniture-stuff"},
110 |
{"id": 113, "name": "fence", "supercategory": "structural"},
111 |
{"id": 114, "name": "floor-marble", "supercategory": "floor"},
112 |
{"id": 115, "name": "floor-other", "supercategory": "floor"},
113 |
{"id": 116, "name": "floor-stone", "supercategory": "floor"},
114 |
{"id": 117, "name": "floor-tile", "supercategory": "floor"},
115 |
{"id": 118, "name": "floor-wood", "supercategory": "floor"},
116 |
{"id": 119, "name": "flower", "supercategory": "plant"},
117 |
{"id": 120, "name": "fog", "supercategory": "water"},
118 |
{"id": 121, "name": "food-other", "supercategory": "food-stuff"},
119 |
{"id": 122, "name": "fruit", "supercategory": "food-stuff"},
120 |
{"id": 123, "name": "furniture-other", "supercategory": "furniture-stuff"},
121 |
{"id": 124, "name": "grass", "supercategory": "plant"},
122 |
{"id": 125, "name": "gravel", "supercategory": "ground"},
123 |
{"id": 126, "name": "ground-other", "supercategory": "ground"},
124 |
{"id": 127, "name": "hill", "supercategory": "solid"},
125 |
{"id": 128, "name": "house", "supercategory": "building"},
126 |
{"id": 129, "name": "leaves", "supercategory": "plant"},
127 |
{"id": 130, "name": "light", "supercategory": "furniture-stuff"},
128 |
{"id": 131, "name": "mat", "supercategory": "textile"},
129 |
{"id": 132, "name": "metal", "supercategory": "raw-material"},
130 |
{"id": 133, "name": "mirror-stuff", "supercategory": "furniture-stuff"},
131 |
{"id": 134, "name": "moss", "supercategory": "plant"},
132 |
{"id": 135, "name": "mountain", "supercategory": "solid"},
133 |
{"id": 136, "name": "mud", "supercategory": "ground"},
134 |
{"id": 137, "name": "napkin", "supercategory": "textile"},
135 |
{"id": 138, "name": "net", "supercategory": "structural"},
136 |
{"id": 139, "name": "paper", "supercategory": "raw-material"},
137 |
{"id": 140, "name": "pavement", "supercategory": "ground"},
138 |
{"id": 141, "name": "pillow", "supercategory": "textile"},
139 |
{"id": 142, "name": "plant-other", "supercategory": "plant"},
140 |
{"id": 143, "name": "plastic", "supercategory": "raw-material"},
141 |
{"id": 144, "name": "platform", "supercategory": "ground"},
142 |
{"id": 145, "name": "playingfield", "supercategory": "ground"},
143 |
{"id": 146, "name": "railing", "supercategory": "structural"},
144 |
{"id": 147, "name": "railroad", "supercategory": "ground"},
145 |
{"id": 148, "name": "river", "supercategory": "water"},
146 |
{"id": 149, "name": "road", "supercategory": "ground"},
147 |
{"id": 150, "name": "rock", "supercategory": "solid"},
148 |
{"id": 151, "name": "roof", "supercategory": "building"},
149 |
{"id": 152, "name": "rug", "supercategory": "textile"},
150 |
{"id": 153, "name": "salad", "supercategory": "food-stuff"},
151 |
{"id": 154, "name": "sand", "supercategory": "ground"},
152 |
{"id": 155, "name": "sea", "supercategory": "water"},
153 |
{"id": 156, "name": "shelf", "supercategory": "furniture-stuff"},
154 |
{"id": 157, "name": "sky-other", "supercategory": "sky"},
155 |
{"id": 158, "name": "skyscraper", "supercategory": "building"},
156 |
{"id": 159, "name": "snow", "supercategory": "ground"},
157 |
{"id": 160, "name": "solid-other", "supercategory": "solid"},
158 |
{"id": 161, "name": "stairs", "supercategory": "furniture-stuff"},
159 |
{"id": 162, "name": "stone", "supercategory": "solid"},
160 |
{"id": 163, "name": "straw", "supercategory": "plant"},
161 |
{"id": 164, "name": "structural-other", "supercategory": "structural"},
162 |
{"id": 165, "name": "table", "supercategory": "furniture-stuff"},
163 |
{"id": 166, "name": "tent", "supercategory": "building"},
164 |
{"id": 167, "name": "textile-other", "supercategory": "textile"},
165 |
{"id": 168, "name": "towel", "supercategory": "textile"},
166 |
{"id": 169, "name": "tree", "supercategory": "plant"},
167 |
{"id": 170, "name": "vegetable", "supercategory": "food-stuff"},
168 |
{"id": 171, "name": "wall-brick", "supercategory": "wall"},
169 |
{"id": 172, "name": "wall-concrete", "supercategory": "wall"},
170 |
{"id": 173, "name": "wall-other", "supercategory": "wall"},
171 |
{"id": 174, "name": "wall-panel", "supercategory": "wall"},
172 |
{"id": 175, "name": "wall-stone", "supercategory": "wall"},
173 |
{"id": 176, "name": "wall-tile", "supercategory": "wall"},
174 |
{"id": 177, "name": "wall-wood", "supercategory": "wall"},
175 |
{"id": 178, "name": "water-other", "supercategory": "water"},
176 |
{"id": 179, "name": "waterdrops", "supercategory": "water"},
177 |
{"id": 180, "name": "window-blind", "supercategory": "window"},
178 |
{"id": 181, "name": "window-other", "supercategory": "window"},
179 |
{"id": 182, "name": "wood", "supercategory": "solid"},
180 |
181 |
182 |
def _get_coco_stuff_meta(cat_list):
183 |
# Id 0 is reserved for ignore_label, we change ignore_label for 0
184 |
# to 255 in our pre-processing.
185 |
stuff_ids = [k["id"] for k in cat_list]
186 |
187 |
# For semantic segmentation, this mapping maps from contiguous stuff id
188 |
# (in [0, 91], used in models) to ids in the dataset (used for processing results)
189 |
stuff_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(stuff_ids)}
190 |
stuff_classes = [k["name"] for k in cat_list]
191 |
192 |
ret = {
193 |
"stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id,
194 |
"stuff_classes": stuff_classes,
195 |
196 |
return ret
197 |
198 |
199 |
def register_all_coco_stuff_10k(root):
200 |
root = os.path.join(root, "coco", "coco_stuff_10k")
201 |
meta = _get_coco_stuff_meta(COCO_CATEGORIES)
202 |
for name, image_dirname, sem_seg_dirname in [
203 |
("train", "images_detectron2/train", "annotations_detectron2/train"),
204 |
205 |
image_dir = os.path.join(root, image_dirname)
206 |
gt_dir = os.path.join(root, sem_seg_dirname)
207 |
name = f"coco_2017_{name}_stuff_10k_sem_seg"
208 |
209 |
210 |
lambda x=image_dir, y=gt_dir: load_sem_seg(
211 |
y, x, gt_ext="png", image_ext="jpg"
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
def register_all_coco_stuff(root):
224 |
root = os.path.join(root, "coco")
225 |
meta = _get_coco_stuff_meta(COCO_CATEGORIES)
226 |
227 |
for name, image_dirname, sem_seg_dirname in [
228 |
("train", "train2017", "stuffthingmaps_detectron2/train2017"),
229 |
230 |
image_dir = os.path.join(root, image_dirname)
231 |
gt_dir = os.path.join(root, sem_seg_dirname)
232 |
all_name = f"coco_2017_{name}_stuff_sem_seg"
233 |
234 |
235 |
lambda x=image_dir, y=gt_dir: load_sem_seg(
236 |
y, x, gt_ext="png", image_ext="jpg"
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
_root = os.getenv("DETECTRON2_DATASETS", "datasets")
249 |
250 |
@@ -0,0 +1,588 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
import os
3 |
4 |
from import DatasetCatalog, MetadataCatalog
5 |
from import load_sem_seg
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
"air conditioner",
73 |
74 |
75 |
76 |
77 |
"baby carriage",
78 |
79 |
80 |
81 |
"bamboo weaving",
82 |
83 |
"baseball bat",
84 |
85 |
"basketball backboard",
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
"bird cage",
96 |
"bird feeder",
97 |
"bird nest",
98 |
99 |
100 |
101 |
102 |
103 |
104 |
"bottle opener",
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
"cabinet door",
117 |
118 |
119 |
120 |
121 |
122 |
123 |
"camera lens",
124 |
125 |
126 |
"candle holder",
127 |
128 |
129 |
130 |
131 |
132 |
"casette recorder",
133 |
"cash register",
134 |
135 |
136 |
"cd player",
137 |
138 |
"cell phone",
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
"clothes tree",
151 |
152 |
"coffee machine",
153 |
154 |
155 |
156 |
157 |
158 |
"control booth",
159 |
160 |
161 |
"copying machine",
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
"cutting board",
177 |
178 |
179 |
"disc case",
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
"drink dispenser",
188 |
"drinking machine",
189 |
190 |
191 |
192 |
"drum kit",
193 |
194 |
195 |
196 |
197 |
198 |
"electric fan",
199 |
"electric iron",
200 |
"electric pot",
201 |
"electric saw",
202 |
"electronic keyboard",
203 |
204 |
205 |
206 |
207 |
"exhibition booth",
208 |
209 |
210 |
211 |
212 |
"fax machine",
213 |
214 |
"ferris wheel",
215 |
"fire extinguisher",
216 |
"fire hydrant",
217 |
"fire place",
218 |
219 |
"fish tank",
220 |
221 |
"fishing net",
222 |
"fishing pole",
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
"game controller",
245 |
"game machine",
246 |
"gas cylinder",
247 |
"gas hood",
248 |
"gas stove",
249 |
"gift box",
250 |
251 |
"glass marble",
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
"hand cart",
264 |
265 |
266 |
267 |
"hard disk drive",
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
"horse-drawn carriage",
278 |
"hot-air balloon",
279 |
280 |
281 |
"inflator pump",
282 |
283 |
284 |
"ironing board",
285 |
286 |
287 |
288 |
289 |
290 |
"kitchen range",
291 |
292 |
293 |
"knife block",
294 |
295 |
"ladder truck",
296 |
297 |
298 |
299 |
300 |
"life buoy",
301 |
302 |
"light bulb",
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
"match book",
315 |
316 |
317 |
318 |
"meter box",
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
"mouse pad",
331 |
"musical instrument",
332 |
333 |
334 |
335 |
336 |
337 |
338 |
339 |
"oxygen bottle",
340 |
341 |
342 |
343 |
"paper box",
344 |
"paper cutter",
345 |
346 |
347 |
348 |
349 |
350 |
351 |
"pen container",
352 |
353 |
354 |
355 |
356 |
357 |
358 |
359 |
360 |
361 |
362 |
363 |
364 |
365 |
366 |
367 |
368 |
369 |
370 |
371 |
"poker chip",
372 |
373 |
"pool table",
374 |
375 |
376 |
377 |
378 |
379 |
380 |
381 |
382 |
383 |
384 |
385 |
386 |
387 |
388 |
"range hood",
389 |
390 |
391 |
"recreational machines",
392 |
"remote control",
393 |
394 |
395 |
396 |
397 |
"rocking horse",
398 |
399 |
400 |
401 |
402 |
403 |
404 |
405 |
406 |
407 |
408 |
409 |
410 |
411 |
412 |
413 |
414 |
"sewing machine",
415 |
416 |
417 |
418 |
419 |
420 |
"shopping cart",
421 |
422 |
423 |
424 |
425 |
"signal light",
426 |
427 |
428 |
429 |
430 |
431 |
432 |
433 |
434 |
435 |
436 |
437 |
438 |
439 |
440 |
441 |
"speed bump",
442 |
"spice container",
443 |
444 |
445 |
446 |
447 |
448 |
449 |
450 |
"sticky note",
451 |
452 |
453 |
454 |
455 |
456 |
457 |
458 |
459 |
"surveillance camera",
460 |
461 |
462 |
"swim ring",
463 |
"swimming pool",
464 |
465 |
466 |
467 |
468 |
469 |
470 |
471 |
472 |
473 |
"telephone booth",
474 |
475 |
476 |
477 |
478 |
479 |
480 |
481 |
482 |
483 |
"toy car",
484 |
485 |
486 |
487 |
"trash bin",
488 |
489 |
490 |
491 |
492 |
493 |
494 |
495 |
496 |
497 |
498 |
499 |
500 |
501 |
"vacuum cleaner",
502 |
"vending machine",
503 |
"video camera",
504 |
"video game console",
505 |
"video player",
506 |
"video tape",
507 |
508 |
509 |
510 |
511 |
512 |
"washing machine",
513 |
514 |
515 |
"water dispenser",
516 |
"water pipe",
517 |
"water skate board",
518 |
519 |
520 |
521 |
522 |
523 |
524 |
"window blinds",
525 |
526 |
527 |
528 |
529 |
530 |
531 |
532 |
533 |
def _get_voc_meta(cat_list):
534 |
ret = {
535 |
"stuff_classes": cat_list,
536 |
537 |
return ret
538 |
539 |
540 |
def register_pascal_context_59(root):
541 |
root = os.path.join(root, "VOCdevkit/VOC2010")
542 |
meta = _get_voc_meta(PASCALCONTEX59_NAMES)
543 |
for name, image_dirname, sem_seg_dirname in [
544 |
("val", "JPEGImages", "annotations_detectron2/pc59_val"),
545 |
546 |
image_dir = os.path.join(root, image_dirname)
547 |
gt_dir = os.path.join(root, sem_seg_dirname)
548 |
all_name = f"pascal_context_59_sem_seg_{name}"
549 |
550 |
551 |
lambda x=image_dir, y=gt_dir: load_sem_seg(
552 |
y, x, gt_ext="png", image_ext="jpg"
553 |
554 |
555 |
556 |
557 |
558 |
559 |
560 |
561 |
562 |
563 |
def register_pascal_context_459(root):
564 |
root = os.path.join(root, "VOCdevkit/VOC2010")
565 |
meta = _get_voc_meta(PASCALCONTEX459_NAMES)
566 |
for name, image_dirname, sem_seg_dirname in [
567 |
("val", "JPEGImages", "annotations_detectron2/pc459_val"),
568 |
569 |
image_dir = os.path.join(root, image_dirname)
570 |
gt_dir = os.path.join(root, sem_seg_dirname)
571 |
all_name = f"pascal_context_459_sem_seg_{name}"
572 |
573 |
574 |
lambda x=image_dir, y=gt_dir: load_sem_seg(
575 |
y, x, gt_ext="tif", image_ext="jpg"
576 |
577 |
578 |
579 |
580 |
581 |
582 |
ignore_label=65535, # NOTE: gt is saved in 16-bit TIFF images
583 |
584 |
585 |
586 |
_root = os.getenv("DETECTRON2_DATASETS", "datasets")
587 |
588 |
@@ -0,0 +1,62 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
import os
3 |
4 |
from import DatasetCatalog, MetadataCatalog
5 |
from import load_sem_seg
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
def _get_voc_meta(cat_list):
31 |
ret = {
32 |
"stuff_classes": cat_list,
33 |
34 |
return ret
35 |
36 |
37 |
def register_pascalvoc(root):
38 |
root = os.path.join(root, "VOCdevkit/VOC2012")
39 |
meta = _get_voc_meta(PASCALVOC20_NAMES)
40 |
41 |
for name, image_dirname, sem_seg_dirname in [
42 |
("val", "JPEGImages", "annotations_detectron2/val"),
43 |
44 |
image_dir = os.path.join(root, image_dirname)
45 |
gt_dir = os.path.join(root, sem_seg_dirname)
46 |
all_name = f"pascalvoc20_sem_seg_{name}"
47 |
48 |
49 |
lambda x=image_dir, y=gt_dir: load_sem_seg(
50 |
y, x, gt_ext="png", image_ext="jpg"
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
_root = os.getenv("DETECTRON2_DATASETS", "datasets")
62 |
@@ -0,0 +1,4 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 |
from .generalized_sem_seg_evaluation import GeneralizedSemSegEvaluator
@@ -0,0 +1,159 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 |
import itertools
5 |
import json
6 |
import numpy as np
7 |
import os
8 |
from collections import OrderedDict
9 |
import PIL.Image as Image
10 |
import torch
11 |
12 |
from import DatasetCatalog, MetadataCatalog
13 |
from detectron2.utils.comm import all_gather, is_main_process, synchronize
14 |
from detectron2.utils.file_io import PathManager
15 |
16 |
from detectron2.evaluation import SemSegEvaluator
17 |
18 |
19 |
class GeneralizedSemSegEvaluator(SemSegEvaluator):
20 |
21 |
Evaluate semantic segmentation metrics.
22 |
23 |
24 |
def __init__(
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
meta = MetadataCatalog.get(dataset_name)
42 |
43 |
self._evaluation_set = meta.evaluation_set
44 |
except AttributeError:
45 |
self._evaluation_set = None
46 |
self.post_process_func = (
47 |
48 |
if post_process_func is not None
49 |
else lambda x, **kwargs: x
50 |
51 |
52 |
def process(self, inputs, outputs):
53 |
54 |
55 |
inputs: the inputs to a model.
56 |
It is a list of dicts. Each dict corresponds to an image and
57 |
contains keys like "height", "width", "file_name".
58 |
outputs: the outputs of a model. It is either list of semantic segmentation predictions
59 |
(Tensor [H, W]) or list of dicts with key "sem_seg" that contains semantic
60 |
segmentation prediction in the same format.
61 |
62 |
for input, output in zip(inputs, outputs):
63 |
output = self.post_process_func(
64 |
output["sem_seg"], image=np.array(["file_name"]))
65 |
66 |
output = output.argmax(dim=0).to(self._cpu_device)
67 |
pred = np.array(output,
68 |
69 |
self.input_file_to_gt_file[input["file_name"]], "rb"
70 |
) as f:
71 |
gt = np.array(,
72 |
73 |
gt[gt == self._ignore_label] = self._num_classes
74 |
75 |
self._conf_matrix += np.bincount(
76 |
(self._num_classes + 1) * pred.reshape(-1) + gt.reshape(-1),
77 |
78 |
79 |
80 |
self._predictions.extend(self.encode_json_sem_seg(pred, input["file_name"]))
81 |
82 |
def evaluate(self):
83 |
84 |
Evaluates standard semantic segmentation metrics (
85 |
86 |
* Mean intersection-over-union averaged across classes (mIoU)
87 |
* Frequency Weighted IoU (fwIoU)
88 |
* Mean pixel accuracy averaged across classes (mACC)
89 |
* Pixel Accuracy (pACC)
90 |
91 |
if self._distributed:
92 |
93 |
conf_matrix_list = all_gather(self._conf_matrix)
94 |
self._predictions = all_gather(self._predictions)
95 |
self._predictions = list(itertools.chain(*self._predictions))
96 |
if not is_main_process():
97 |
98 |
99 |
self._conf_matrix = np.zeros_like(self._conf_matrix)
100 |
for conf_matrix in conf_matrix_list:
101 |
self._conf_matrix += conf_matrix
102 |
103 |
if self._output_dir:
104 |
105 |
file_path = os.path.join(self._output_dir, "sem_seg_predictions.json")
106 |
with, "w") as f:
107 |
108 |
109 |
acc = np.full(self._num_classes, np.nan, dtype=np.float)
110 |
iou = np.full(self._num_classes, np.nan, dtype=np.float)
111 |
tp = self._conf_matrix.diagonal()[:-1].astype(np.float)
112 |
pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(np.float)
113 |
class_weights = pos_gt / np.sum(pos_gt)
114 |
pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(np.float)
115 |
acc_valid = pos_gt > 0
116 |
acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid]
117 |
iou_valid = (pos_gt + pos_pred) > 0
118 |
union = pos_gt + pos_pred - tp
119 |
iou[acc_valid] = tp[acc_valid] / union[acc_valid]
120 |
macc = np.sum(acc[acc_valid]) / np.sum(acc_valid)
121 |
miou = np.sum(iou[acc_valid]) / np.sum(iou_valid)
122 |
fiou = np.sum(iou[acc_valid] * class_weights[acc_valid])
123 |
pacc = np.sum(tp) / np.sum(pos_gt)
124 |
125 |
res = {}
126 |
res["mIoU"] = 100 * miou
127 |
res["fwIoU"] = 100 * fiou
128 |
for i, name in enumerate(self._class_names):
129 |
res["IoU-{}".format(name)] = 100 * iou[i]
130 |
res["mACC"] = 100 * macc
131 |
res["pACC"] = 100 * pacc
132 |
for i, name in enumerate(self._class_names):
133 |
res["ACC-{}".format(name)] = 100 * acc[i]
134 |
if self._evaluation_set is not None:
135 |
for set_name, set_inds in self._evaluation_set.items():
136 |
iou_list = []
137 |
set_inds = np.array(set_inds,
138 |
mask = np.zeros((len(iou),)).astype(np.bool)
139 |
mask[set_inds] = 1
140 |
miou = np.sum(iou[mask][acc_valid[mask]]) / np.sum(iou_valid[mask])
141 |
pacc = np.sum(tp[mask]) / np.sum(pos_gt[mask])
142 |
res["mIoU-{}".format(set_name)] = 100 * miou
143 |
res["pAcc-{}".format(set_name)] = 100 * pacc
144 |
145 |
miou = np.sum(iou[~mask][acc_valid[~mask]]) / np.sum(iou_valid[~mask])
146 |
pacc = np.sum(tp[~mask]) / np.sum(pos_gt[~mask])
147 |
res["mIoU-un{}".format(set_name)] = 100 * miou
148 |
res["pAcc-un{}".format(set_name)] = 100 * pacc
149 |
150 |
res["hIoU-{}".format(set_name)] = (
151 |
100 * len(iou_list) / sum([1 / iou for iou in iou_list])
152 |
153 |
if self._output_dir:
154 |
file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth")
155 |
with, "wb") as f:
156 |
+, f)
157 |
results = OrderedDict({"sem_seg": res})
158 |
159 |
return results
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 |
from typing import Tuple
5 |
6 |
import torch
7 |
from torch import nn
8 |
from torch.nn import functional as F
9 |
10 |
from detectron2.config import configurable
11 |
from import MetadataCatalog
12 |
from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head
13 |
from detectron2.modeling.backbone import Backbone
14 |
from detectron2.modeling.postprocessing import sem_seg_postprocess
15 |
from detectron2.structures import ImageList
16 |
17 |
from .modeling.criterion import SetCriterion
18 |
from .modeling.matcher import HungarianMatcher
19 |
20 |
21 |
22 |
class MaskFormer(nn.Module):
23 |
24 |
Main class for mask classification semantic segmentation architectures.
25 |
26 |
27 |
28 |
def __init__(
29 |
30 |
31 |
backbone: Backbone,
32 |
sem_seg_head: nn.Module,
33 |
criterion: nn.Module,
34 |
num_queries: int,
35 |
panoptic_on: bool,
36 |
object_mask_threshold: float,
37 |
overlap_threshold: float,
38 |
39 |
size_divisibility: int,
40 |
sem_seg_postprocess_before_inference: bool,
41 |
pixel_mean: Tuple[float],
42 |
pixel_std: Tuple[float],
43 |
44 |
45 |
46 |
backbone: a backbone module, must follow detectron2's backbone interface
47 |
sem_seg_head: a module that predicts semantic segmentation from backbone features
48 |
criterion: a module that defines the loss
49 |
num_queries: int, number of queries
50 |
panoptic_on: bool, whether to output panoptic segmentation prediction
51 |
object_mask_threshold: float, threshold to filter query based on classification score
52 |
for panoptic segmentation inference
53 |
overlap_threshold: overlap threshold used in general inference for panoptic segmentation
54 |
metadata: dataset meta, get `thing` and `stuff` category names for panoptic
55 |
segmentation inference
56 |
size_divisibility: Some backbones require the input height and width to be divisible by a
57 |
specific integer. We can use this to override such requirement.
58 |
sem_seg_postprocess_before_inference: whether to resize the prediction back
59 |
to original input size before semantic segmentation inference or after.
60 |
For high-resolution dataset like Mapillary, resizing predictions before
61 |
inference will cause OOM error.
62 |
pixel_mean, pixel_std: list or tuple with #channels element, representing
63 |
the per-channel mean and std to be used to normalize the input image
64 |
65 |
66 |
self.backbone = backbone
67 |
self.sem_seg_head = sem_seg_head
68 |
self.criterion = criterion
69 |
self.num_queries = num_queries
70 |
self.overlap_threshold = overlap_threshold
71 |
self.panoptic_on = panoptic_on
72 |
self.object_mask_threshold = object_mask_threshold
73 |
self.metadata = metadata
74 |
if size_divisibility < 0:
75 |
# use backbone size_divisibility if not set
76 |
size_divisibility = self.backbone.size_divisibility
77 |
self.size_divisibility = size_divisibility
78 |
self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference
79 |
80 |
"pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False
81 |
82 |
self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)
83 |
84 |
85 |
def from_config(cls, cfg):
86 |
backbone = build_backbone(cfg)
87 |
sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())
88 |
89 |
# Loss parameters:
90 |
91 |
92 |
93 |
94 |
95 |
# building criterion
96 |
matcher = HungarianMatcher(
97 |
98 |
99 |
100 |
101 |
102 |
weight_dict = {"loss_ce": 1, "loss_mask": mask_weight, "loss_dice": dice_weight}
103 |
if deep_supervision:
104 |
105 |
aux_weight_dict = {}
106 |
for i in range(dec_layers - 1):
107 |
aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
108 |
109 |
110 |
losses = ["labels", "masks"]
111 |
112 |
criterion = SetCriterion(
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
return {
121 |
"backbone": backbone,
122 |
"sem_seg_head": sem_seg_head,
123 |
"criterion": criterion,
124 |
125 |
126 |
127 |
128 |
"metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
129 |
130 |
"sem_seg_postprocess_before_inference": (
131 |
132 |
133 |
134 |
"pixel_mean": cfg.MODEL.PIXEL_MEAN,
135 |
"pixel_std": cfg.MODEL.PIXEL_STD,
136 |
137 |
138 |
139 |
def device(self):
140 |
return self.pixel_mean.device
141 |
142 |
def forward(self, batched_inputs):
143 |
144 |
145 |
batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
146 |
Each item in the list contains the inputs for one image.
147 |
For now, each item in the list is a dict that contains:
148 |
* "image": Tensor, image in (C, H, W) format.
149 |
* "instances": per-region ground truth
150 |
* Other information that's included in the original dicts, such as:
151 |
"height", "width" (int): the output resolution of the model (may be different
152 |
from input resolution), used in inference.
153 |
154 |
155 |
each dict has the results for one image. The dict contains the following keys:
156 |
157 |
* "sem_seg":
158 |
A Tensor that represents the
159 |
per-pixel segmentation predicted by the head.
160 |
The prediction has shape KxHxW that represents the logits of
161 |
each class for each pixel.
162 |
* "panoptic_seg":
163 |
A tuple that represent panoptic output
164 |
panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
165 |
segments_info (list[dict]): Describe each segment in `panoptic_seg`.
166 |
Each dict contains keys "id", "category_id", "isthing".
167 |
168 |
images = [x["image"].to(self.device) for x in batched_inputs]
169 |
images = [(x - self.pixel_mean) / self.pixel_std for x in images]
170 |
images = ImageList.from_tensors(images, self.size_divisibility)
171 |
172 |
features = self.backbone(images.tensor)
173 |
outputs = self.sem_seg_head(features)
174 |
175 |
176 |
# mask classification target
177 |
if "instances" in batched_inputs[0]:
178 |
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
179 |
targets = self.prepare_targets(gt_instances, images)
180 |
181 |
targets = None
182 |
183 |
# bipartite matching-based loss
184 |
losses = self.criterion(outputs, targets)
185 |
186 |
for k in list(losses.keys()):
187 |
if k in self.criterion.weight_dict:
188 |
losses[k] *= self.criterion.weight_dict[k]
189 |
190 |
# remove this loss if not specified in `weight_dict`
191 |
192 |
193 |
return losses
194 |
195 |
mask_cls_results = outputs["pred_logits"]
196 |
mask_pred_results = outputs["pred_masks"]
197 |
# upsample masks
198 |
mask_pred_results = F.interpolate(
199 |
200 |
size=(images.tensor.shape[-2], images.tensor.shape[-1]),
201 |
202 |
203 |
204 |
205 |
processed_results = []
206 |
for mask_cls_result, mask_pred_result, input_per_image, image_size in zip(
207 |
mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes
208 |
209 |
height = input_per_image.get("height", image_size[0])
210 |
width = input_per_image.get("width", image_size[1])
211 |
212 |
if self.sem_seg_postprocess_before_inference:
213 |
mask_pred_result = sem_seg_postprocess(
214 |
mask_pred_result, image_size, height, width
215 |
216 |
217 |
# semantic segmentation inference
218 |
r = self.semantic_inference(mask_cls_result, mask_pred_result)
219 |
if not self.sem_seg_postprocess_before_inference:
220 |
r = sem_seg_postprocess(r, image_size, height, width)
221 |
processed_results.append({"sem_seg": r})
222 |
223 |
# panoptic segmentation inference
224 |
if self.panoptic_on:
225 |
panoptic_r = self.panoptic_inference(
226 |
mask_cls_result, mask_pred_result
227 |
228 |
processed_results[-1]["panoptic_seg"] = panoptic_r
229 |
230 |
return processed_results
231 |
232 |
def prepare_targets(self, targets, images):
233 |
h, w = images.tensor.shape[-2:]
234 |
new_targets = []
235 |
for targets_per_image in targets:
236 |
# pad gt
237 |
gt_masks = targets_per_image.gt_masks
238 |
padded_masks = torch.zeros(
239 |
(gt_masks.shape[0], h, w), dtype=gt_masks.dtype, device=gt_masks.device
240 |
241 |
padded_masks[:, : gt_masks.shape[1], : gt_masks.shape[2]] = gt_masks
242 |
243 |
244 |
"labels": targets_per_image.gt_classes,
245 |
"masks": padded_masks,
246 |
247 |
248 |
return new_targets
249 |
250 |
def semantic_inference(self, mask_cls, mask_pred):
    """Fuse per-query class scores and per-query masks into a class score map.

    Args:
        mask_cls: class logits, one row per query (``q`` x classes). The last
            class column is dropped after the softmax (presumably the
            "no object" class -- confirm against the head).
        mask_pred: mask logits of shape (q, H, W).

    Returns:
        Tensor of shape (c, H, W): for every class, the sum over queries of
        class probability times mask probability.
    """
    # Softmax over *all* classes first, then discard the trailing column so
    # the remaining probabilities keep their original normalization.
    class_probs = F.softmax(mask_cls, dim=-1)[..., :-1]
    mask_probs = mask_pred.sigmoid()
    return torch.einsum("qc,qhw->chw", class_probs, mask_probs)
Binary file (6.15 kB). View file
@@ -0,0 +1,8 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 |
from .backbone.swin import D2SwinTransformer
5 |
from .backbone.clip_resnet import D2ModifiedResNet
6 |
from .heads.mask_former_head import MaskFormerHead
7 |
from .heads.open_vocab_mask_former_head import OpenVocabMaskFormerHead
8 |
from .heads.pixel_decoder import BasePixelDecoder
@@ -0,0 +1,2 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
@@ -0,0 +1,206 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 |
from collections import OrderedDict
5 |
import torch
6 |
import torch.nn as nn
7 |
from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
8 |
9 |
10 |
class Bottleneck(nn.Module):
11 |
expansion = 4
12 |
13 |
def __init__(self, inplanes, planes, stride=1, dilation=1):
14 |
15 |
16 |
# all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
17 |
self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
18 |
self.bn1 = nn.BatchNorm2d(planes)
19 |
20 |
self.conv2 = nn.Conv2d(
21 |
planes, planes, 3, padding=1 * dilation, bias=False, dilation=dilation
22 |
23 |
self.bn2 = nn.BatchNorm2d(planes)
24 |
25 |
self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
26 |
27 |
self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
28 |
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
29 |
30 |
self.relu = nn.ReLU(inplace=True)
31 |
self.downsample = None
32 |
self.stride = stride
33 |
34 |
if stride > 1 or inplanes != planes * Bottleneck.expansion:
35 |
# downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
36 |
self.downsample = nn.Sequential(
37 |
38 |
39 |
("-1", nn.AvgPool2d(stride)),
40 |
41 |
42 |
43 |
44 |
planes * self.expansion,
45 |
46 |
47 |
48 |
49 |
50 |
("1", nn.BatchNorm2d(planes * self.expansion)),
51 |
52 |
53 |
54 |
55 |
def forward(self, x: torch.Tensor):
    """Apply the residual bottleneck to ``x``.

    The main path is conv1/bn1/relu -> conv2/bn2/relu -> avgpool -> conv3/bn3.
    The shortcut is ``x`` itself, or ``self.downsample(x)`` when a downsample
    branch was built. The two are summed and passed through a final ReLU.
    """
    identity = x

    out = self.relu(self.bn1(self.conv1(x)))
    out = self.relu(self.bn2(self.conv2(out)))
    out = self.avgpool(out)
    out = self.bn3(self.conv3(out))

    if self.downsample is not None:
        identity = self.downsample(x)

    # In-place add on the main-path tensor (same as the original code).
    out += identity
    return self.relu(out)
69 |
70 |
71 |
class ModifiedResNet(nn.Module):
72 |
73 |
A ResNet class that is similar to torchvision's but contains the following changes:
74 |
- There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
75 |
- Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
76 |
- The final pooling layer is a QKV attention instead of an average pool
77 |
78 |
79 |
def __init__(self, layers, width=64, strides=[2, 1, 2, 2, 2], multi_grid=[1, 1, 1]):
80 |
81 |
82 |
# the 3-layer stem
83 |
self.conv1 = nn.Conv2d(
84 |
3, width // 2, kernel_size=3, stride=2, padding=1, bias=False
85 |
86 |
self.bn1 = nn.BatchNorm2d(width // 2)
87 |
self.conv2 = nn.Conv2d(
88 |
width // 2, width // 2, kernel_size=3, padding=1, bias=False
89 |
90 |
self.bn2 = nn.BatchNorm2d(width // 2)
91 |
self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
92 |
self.bn3 = nn.BatchNorm2d(width)
93 |
self.avgpool = nn.AvgPool2d(strides[0]) if strides[0] > 1 else nn.Identity()
94 |
self.relu = nn.ReLU(inplace=True)
95 |
96 |
# residual layers
97 |
self._inplanes = width # this is a *mutable* variable used during construction
98 |
self.layer1 = self._make_layer(width, layers[0], stride=strides[1])
99 |
self.layer2 = self._make_layer(width * 2, layers[1], stride=strides[2])
100 |
self.layer3 = self._make_layer(width * 4, layers[2], stride=strides[3])
101 |
self.layer4 = self._make_layer(
102 |
width * 8, layers[3], stride=strides[4], dilations=multi_grid
103 |
104 |
self.num_features = [width * 4, width * 8, width * 16, width * 32]
105 |
106 |
def _make_layer(self, planes, blocks, stride=1, dilations=None):
    """Build one residual stage as an ``nn.Sequential`` of ``Bottleneck`` blocks.

    Args:
        planes: bottleneck width; the stage outputs
            ``planes * Bottleneck.expansion`` channels.
        blocks: number of ``Bottleneck`` blocks in the stage.
        stride: stride passed to the first block only; later blocks use the
            ``Bottleneck`` default.
        dilations: per-block dilation, indexed ``0 .. blocks - 1``.
            Defaults to 1 for every block.

    Side effect: updates ``self._inplanes`` to the stage's output channel
    count so the next call starts from the right input width.
    """
    if dilations is None:
        dilations = [1] * blocks

    # Only the first block sees the incoming channel count and the stride.
    layers = [Bottleneck(self._inplanes, planes, stride, dilation=dilations[0])]
    self._inplanes = planes * Bottleneck.expansion

    for i in range(1, blocks):
        layers.append(Bottleneck(self._inplanes, planes, dilation=dilations[i]))

    return nn.Sequential(*layers)
116 |
117 |
def forward(self, x):
    """Run the stem and the four residual stages, returning every stage output.

    Returns:
        dict with keys "res2", "res3", "res4", "res5" (in that order) holding
        the outputs of ``layer1`` .. ``layer4``.
    """

    def stem(x):
        # Three conv/bn/relu units followed by the stem average pool.
        for conv, bn in [
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        ]:
            x = self.relu(bn(conv(x)))
        x = self.avgpool(x)
        return x

    output = {}
    # Match the input dtype to the weights (e.g. when the model is in half precision).
    x = x.type(self.conv1.weight.dtype)
    # The scale comments below assume the default strides [2, 1, 2, 2, 2].
    x = stem(x)  # 1/4,1/4
    x = self.layer1(x)
    output["res2"] = x
    x = self.layer2(x)  # 1/8,1/8
    output["res3"] = x
    x = self.layer3(x)  # 1/16,1/16
    output["res4"] = x
    x = self.layer4(x)  # 1/32,1/32
    output["res5"] = x
    return output
140 |
141 |
142 |
143 |
class D2ModifiedResNet(ModifiedResNet, Backbone):
144 |
def __init__(self, cfg, input_shape):
145 |
146 |
147 |
width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
148 |
bottleneck_channels = num_groups * width_per_group
149 |
num_blocks_per_stage = {
150 |
18: [2, 2, 2, 2],
151 |
34: [3, 4, 6, 3],
152 |
50: [3, 4, 6, 3],
153 |
101: [3, 4, 23, 3],
154 |
152: [3, 8, 36, 3],
155 |
156 |
strides = [2, 1, 2, 2, 2]
157 |
158 |
if cfg.MODEL.RESNETS.STEM_TYPE == "deeplab":
159 |
strides = [1, 1, 2, 2, 2]
160 |
161 |
162 |
163 |
164 |
165 |
166 |
self._out_features = cfg.MODEL.RESNETS.OUT_FEATURES
167 |
168 |
self._out_feature_strides = {
169 |
"res2": 4,
170 |
"res3": 8,
171 |
"res4": 16,
172 |
"res5": 32,
173 |
174 |
self._out_feature_channels = {
175 |
"res2": self.num_features[0],
176 |
"res3": self.num_features[1],
177 |
"res4": self.num_features[2],
178 |
"res5": self.num_features[3],
179 |
180 |
181 |
def forward(self, x):
    """Run the parent backbone and keep only the requested feature maps.

    Args:
        x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.

    Returns:
        dict[str->Tensor]: names and the corresponding features
    """
    features = super().forward(x)
    # Filter to the names listed in ``self._out_features``, preserving the
    # order in which the parent produced them.
    return {
        name: feature
        for name, feature in features.items()
        if name in self._out_features
    }
194 |
195 |
def output_shape(self):
196 |
return {
197 |
name: ShapeSpec(
198 |
199 |
200 |
201 |
for name in self._out_features
202 |
203 |
@property
def size_divisibility(self):
    """Return the multiple (32) that input height and width must be padded to.

    Exposed as a property: the model reads ``backbone.size_divisibility`` as a
    plain value and passes it on when batching images.
    """
    return 32
@@ -0,0 +1,832 @@
1 |
# --------------------------------------------------------
2 |
# Swin Transformer
3 |
# Copyright (c) 2021 Microsoft
4 |
# Licensed under The MIT License [see LICENSE for details]
5 |
# Written by Ze Liu, Yutong Lin, Yixuan Wei
6 |
# --------------------------------------------------------
7 |
8 |
# Copyright (c) Facebook, Inc. and its affiliates.
9 |
# Modified by Bowen Cheng from
10 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
11 |
12 |
import numpy as np
13 |
import torch
14 |
import torch.nn as nn
15 |
import torch.nn.functional as F
16 |
import torch.utils.checkpoint as checkpoint
17 |
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
18 |
19 |
from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
20 |
21 |
22 |
class Mlp(nn.Module):
23 |
"""Multilayer perceptron."""
24 |
25 |
def __init__(
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
out_features = out_features or in_features
35 |
hidden_features = hidden_features or in_features
36 |
self.fc1 = nn.Linear(in_features, hidden_features)
37 |
self.act = act_layer()
38 |
self.fc2 = nn.Linear(hidden_features, out_features)
39 |
self.drop = nn.Dropout(drop)
40 |
41 |
def forward(self, x):
    """Apply fc1 -> activation -> dropout -> fc2 -> dropout to ``x``."""
    x = self.fc1(x)
    x = self.act(x)
    x = self.drop(x)
    x = self.fc2(x)
    # The same dropout module is reused after the second linear layer.
    x = self.drop(x)
    return x
48 |
49 |
50 |
def window_partition(x, window_size):
    """Split a feature map into non-overlapping square windows.

    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    # Expose the window grid: (B, rows of windows, ws, cols of windows, ws, C).
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    # Bring the two grid axes together, then fold them into the batch axis.
    windows = (
        x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    )
    return windows
64 |
65 |
66 |
def window_reverse(windows, window_size, H, W):
    """Reassemble windows into a feature map (inverse of ``window_partition``).

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    # Windows per image is an exact integer when H and W are multiples of
    # window_size, so recover the batch size with integer arithmetic instead
    # of a float division round-trip.
    windows_per_image = (H // window_size) * (W // window_size)
    B = windows.shape[0] // windows_per_image
    x = windows.view(
        B, H // window_size, W // window_size, window_size, window_size, -1
    )
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x
82 |
83 |
84 |
class WindowAttention(nn.Module):
85 |
"""Window based multi-head self attention (W-MSA) module with relative position bias.
86 |
It supports both of shifted and non-shifted window.
87 |
88 |
dim (int): Number of input channels.
89 |
window_size (tuple[int]): The height and width of the window.
90 |
num_heads (int): Number of attention heads.
91 |
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
92 |
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
93 |
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
94 |
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
95 |
96 |
97 |
def __init__(
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
self.dim = dim
110 |
self.window_size = window_size # Wh, Ww
111 |
self.num_heads = num_heads
112 |
head_dim = dim // num_heads
113 |
self.scale = qk_scale or head_dim ** -0.5
114 |
115 |
# define a parameter table of relative position bias
116 |
self.relative_position_bias_table = nn.Parameter(
117 |
torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
118 |
) # 2*Wh-1 * 2*Ww-1, nH
119 |
120 |
# get pair-wise relative position index for each token inside the window
121 |
coords_h = torch.arange(self.window_size[0])
122 |
coords_w = torch.arange(self.window_size[1])
123 |
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
124 |
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
125 |
relative_coords = (
126 |
coords_flatten[:, :, None] - coords_flatten[:, None, :]
127 |
) # 2, Wh*Ww, Wh*Ww
128 |
relative_coords = relative_coords.permute(
129 |
1, 2, 0
130 |
).contiguous() # Wh*Ww, Wh*Ww, 2
131 |
relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
132 |
relative_coords[:, :, 1] += self.window_size[1] - 1
133 |
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
134 |
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
135 |
self.register_buffer("relative_position_index", relative_position_index)
136 |
137 |
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
138 |
self.attn_drop = nn.Dropout(attn_drop)
139 |
self.proj = nn.Linear(dim, dim)
140 |
self.proj_drop = nn.Dropout(proj_drop)
141 |
142 |
trunc_normal_(self.relative_position_bias_table, std=0.02)
143 |
self.softmax = nn.Softmax(dim=-1)
144 |
145 |
def forward(self, x, mask=None):
146 |
"""Forward function.
147 |
148 |
x: input features with shape of (num_windows*B, N, C)
149 |
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
150 |
151 |
B_, N, C = x.shape
152 |
qkv = (
153 |
154 |
.reshape(B_, N, 3, self.num_heads, C // self.num_heads)
155 |
.permute(2, 0, 3, 1, 4)
156 |
157 |
q, k, v = (
158 |
159 |
160 |
161 |
) # make torchscript happy (cannot use tensor as tuple)
162 |
163 |
q = q * self.scale
164 |
attn = q @ k.transpose(-2, -1)
165 |
166 |
relative_position_bias = self.relative_position_bias_table[
167 |
168 |
169 |
self.window_size[0] * self.window_size[1],
170 |
self.window_size[0] * self.window_size[1],
171 |
172 |
) # Wh*Ww,Wh*Ww,nH
173 |
relative_position_bias = relative_position_bias.permute(
174 |
2, 0, 1
175 |
).contiguous() # nH, Wh*Ww, Wh*Ww
176 |
attn = attn + relative_position_bias.unsqueeze(0)
177 |
178 |
if mask is not None:
179 |
nW = mask.shape[0]
180 |
attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(
181 |
182 |
183 |
attn = attn.view(-1, self.num_heads, N, N)
184 |
attn = self.softmax(attn)
185 |
186 |
attn = self.softmax(attn)
187 |
188 |
attn = self.attn_drop(attn)
189 |
190 |
x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
191 |
x = self.proj(x)
192 |
x = self.proj_drop(x)
193 |
return x
194 |
195 |
196 |
class SwinTransformerBlock(nn.Module):
197 |
"""Swin Transformer Block.
198 |
199 |
dim (int): Number of input channels.
200 |
num_heads (int): Number of attention heads.
201 |
window_size (int): Window size.
202 |
shift_size (int): Shift size for SW-MSA.
203 |
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
204 |
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
205 |
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
206 |
drop (float, optional): Dropout rate. Default: 0.0
207 |
attn_drop (float, optional): Attention dropout rate. Default: 0.0
208 |
drop_path (float, optional): Stochastic depth rate. Default: 0.0
209 |
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
210 |
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
211 |
212 |
213 |
def __init__(
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
self.dim = dim
230 |
self.num_heads = num_heads
231 |
self.window_size = window_size
232 |
self.shift_size = shift_size
233 |
self.mlp_ratio = mlp_ratio
234 |
assert (
235 |
0 <= self.shift_size < self.window_size
236 |
), "shift_size must in 0-window_size"
237 |
238 |
self.norm1 = norm_layer(dim)
239 |
self.attn = WindowAttention(
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
250 |
self.norm2 = norm_layer(dim)
251 |
mlp_hidden_dim = int(dim * mlp_ratio)
252 |
self.mlp = Mlp(
253 |
254 |
255 |
256 |
257 |
258 |
259 |
self.H = None
260 |
self.W = None
261 |
262 |
def forward(self, x, mask_matrix):
263 |
"""Forward function.
264 |
265 |
x: Input feature, tensor size (B, H*W, C).
266 |
H, W: Spatial resolution of the input feature.
267 |
mask_matrix: Attention mask for cyclic shift.
268 |
269 |
B, L, C = x.shape
270 |
H, W = self.H, self.W
271 |
assert L == H * W, "input feature has wrong size"
272 |
273 |
shortcut = x
274 |
x = self.norm1(x)
275 |
x = x.view(B, H, W, C)
276 |
277 |
# pad feature maps to multiples of window size
278 |
pad_l = pad_t = 0
279 |
pad_r = (self.window_size - W % self.window_size) % self.window_size
280 |
pad_b = (self.window_size - H % self.window_size) % self.window_size
281 |
x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
282 |
_, Hp, Wp, _ = x.shape
283 |
284 |
# cyclic shift
285 |
if self.shift_size > 0:
286 |
shifted_x = torch.roll(
287 |
x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)
288 |
289 |
attn_mask = mask_matrix
290 |
291 |
shifted_x = x
292 |
attn_mask = None
293 |
294 |
# partition windows
295 |
x_windows = window_partition(
296 |
shifted_x, self.window_size
297 |
) # nW*B, window_size, window_size, C
298 |
x_windows = x_windows.view(
299 |
-1, self.window_size * self.window_size, C
300 |
) # nW*B, window_size*window_size, C
301 |
302 |
303 |
attn_windows = self.attn(
304 |
x_windows, mask=attn_mask
305 |
) # nW*B, window_size*window_size, C
306 |
307 |
# merge windows
308 |
attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
309 |
shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C
310 |
311 |
# reverse cyclic shift
312 |
if self.shift_size > 0:
313 |
x = torch.roll(
314 |
shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)
315 |
316 |
317 |
x = shifted_x
318 |
319 |
if pad_r > 0 or pad_b > 0:
320 |
x = x[:, :H, :W, :].contiguous()
321 |
322 |
x = x.view(B, H * W, C)
323 |
324 |
325 |
x = shortcut + self.drop_path(x)
326 |
x = x + self.drop_path(self.mlp(self.norm2(x)))
327 |
328 |
return x
329 |
330 |
331 |
class PatchMerging(nn.Module):
332 |
"""Patch Merging Layer
333 |
334 |
dim (int): Number of input channels.
335 |
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
336 |
337 |
338 |
def __init__(self, dim, norm_layer=nn.LayerNorm):
339 |
340 |
self.dim = dim
341 |
self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
342 |
self.norm = norm_layer(4 * dim)
343 |
344 |
def forward(self, x, H, W):
    """Forward function.

    Merges every 2x2 neighbourhood of tokens into one token with 4*C channels,
    then applies ``self.norm`` and ``self.reduction``.

    Args:
        x: Input feature, tensor size (B, H*W, C).
        H, W: Spatial resolution of the input feature.
    """
    B, L, C = x.shape
    assert L == H * W, "input feature has wrong size"

    x = x.view(B, H, W, C)

    # padding: extend odd H/W by one row/column of zeros so 2x2 groups tile the map
    pad_input = (H % 2 == 1) or (W % 2 == 1)
    if pad_input:
        x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))

    x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
    x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
    x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
    x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
    # Stack the four sub-grids on the channel axis.
    x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
    x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C

    x = self.norm(x)
    x = self.reduction(x)

    return x
371 |
372 |
373 |
class BasicLayer(nn.Module):
374 |
"""A basic Swin Transformer layer for one stage.
375 |
376 |
dim (int): Number of feature channels
377 |
depth (int): Depths of this stage.
378 |
num_heads (int): Number of attention head.
379 |
window_size (int): Local window size. Default: 7.
380 |
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
381 |
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
382 |
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
383 |
drop (float, optional): Dropout rate. Default: 0.0
384 |
attn_drop (float, optional): Attention dropout rate. Default: 0.0
385 |
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
386 |
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
387 |
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
388 |
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
389 |
390 |
391 |
def __init__(
392 |
393 |
394 |
395 |
396 |
397 |
398 |
399 |
400 |
401 |
402 |
403 |
404 |
405 |
406 |
407 |
408 |
self.window_size = window_size
409 |
self.shift_size = window_size // 2
410 |
self.depth = depth
411 |
self.use_checkpoint = use_checkpoint
412 |
413 |
# build blocks
414 |
self.blocks = nn.ModuleList(
415 |
416 |
417 |
418 |
419 |
420 |
shift_size=0 if (i % 2 == 0) else window_size // 2,
421 |
422 |
423 |
424 |
425 |
426 |
427 |
if isinstance(drop_path, list)
428 |
else drop_path,
429 |
430 |
431 |
for i in range(depth)
432 |
433 |
434 |
435 |
# patch merging layer
436 |
if downsample is not None:
437 |
self.downsample = downsample(dim=dim, norm_layer=norm_layer)
438 |
439 |
self.downsample = None
440 |
441 |
def forward(self, x, H, W):
442 |
"""Forward function.
443 |
444 |
x: Input feature, tensor size (B, H*W, C).
445 |
H, W: Spatial resolution of the input feature.
446 |
447 |
448 |
# calculate attention mask for SW-MSA
449 |
Hp = int(np.ceil(H / self.window_size)) * self.window_size
450 |
Wp = int(np.ceil(W / self.window_size)) * self.window_size
451 |
img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
452 |
h_slices = (
453 |
slice(0, -self.window_size),
454 |
slice(-self.window_size, -self.shift_size),
455 |
slice(-self.shift_size, None),
456 |
457 |
w_slices = (
458 |
slice(0, -self.window_size),
459 |
slice(-self.window_size, -self.shift_size),
460 |
slice(-self.shift_size, None),
461 |
462 |
cnt = 0
463 |
for h in h_slices:
464 |
for w in w_slices:
465 |
img_mask[:, h, w, :] = cnt
466 |
cnt += 1
467 |
468 |
mask_windows = window_partition(
469 |
img_mask, self.window_size
470 |
) # nW, window_size, window_size, 1
471 |
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
472 |
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
473 |
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
474 |
attn_mask == 0, float(0.0)
475 |
476 |
477 |
for blk in self.blocks:
478 |
blk.H, blk.W = H, W
479 |
if self.use_checkpoint:
480 |
x = checkpoint.checkpoint(blk, x, attn_mask)
481 |
482 |
x = blk(x, attn_mask)
483 |
if self.downsample is not None:
484 |
x_down = self.downsample(x, H, W)
485 |
Wh, Ww = (H + 1) // 2, (W + 1) // 2
486 |
return x, H, W, x_down, Wh, Ww
487 |
488 |
return x, H, W, x, H, W
489 |
490 |
491 |
class PatchEmbed(nn.Module):
492 |
"""Image to Patch Embedding
493 |
494 |
patch_size (int): Patch token size. Default: 4.
495 |
in_chans (int): Number of input image channels. Default: 3.
496 |
embed_dim (int): Number of linear projection output channels. Default: 96.
497 |
norm_layer (nn.Module, optional): Normalization layer. Default: None
498 |
499 |
500 |
def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
501 |
502 |
patch_size = to_2tuple(patch_size)
503 |
self.patch_size = patch_size
504 |
505 |
self.in_chans = in_chans
506 |
self.embed_dim = embed_dim
507 |
508 |
self.proj = nn.Conv2d(
509 |
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
510 |
511 |
if norm_layer is not None:
512 |
self.norm = norm_layer(embed_dim)
513 |
514 |
self.norm = None
515 |
516 |
def forward(self, x):
    """Forward function.

    Pads the input on the right/bottom up to a multiple of the patch size,
    projects it with ``self.proj``, and optionally normalizes over channels.
    """
    # padding
    _, _, H, W = x.size()
    if W % self.patch_size[1] != 0:
        x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
    if H % self.patch_size[0] != 0:
        x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))

    x = self.proj(x)  # B C Wh Ww
    if self.norm is not None:
        # The norm is applied on a (B, Wh*Ww, C) view, so flatten the spatial
        # axes, normalize, then restore the (B, C, Wh, Ww) layout.
        Wh, Ww = x.size(2), x.size(3)
        x = x.flatten(2).transpose(1, 2)
        x = self.norm(x)
        x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)

    return x
533 |
534 |
535 |
class SwinTransformer(nn.Module):
536 |
"""Swin Transformer backbone.
537 |
A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
538 |
539 |
540 |
pretrain_img_size (int): Input image size for training the pretrained model,
541 |
used in absolute position embedding. Default 224.
542 |
patch_size (int | tuple(int)): Patch size. Default: 4.
543 |
in_chans (int): Number of input image channels. Default: 3.
544 |
embed_dim (int): Number of linear projection output channels. Default: 96.
545 |
depths (tuple[int]): Depths of each Swin Transformer stage.
546 |
num_heads (tuple[int]): Number of attention head of each stage.
547 |
window_size (int): Window size. Default: 7.
548 |
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
549 |
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
550 |
qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
551 |
drop_rate (float): Dropout rate.
552 |
attn_drop_rate (float): Attention dropout rate. Default: 0.
553 |
drop_path_rate (float): Stochastic depth rate. Default: 0.2.
554 |
norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
555 |
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
556 |
patch_norm (bool): If True, add normalization after patch embedding. Default: True.
557 |
out_indices (Sequence[int]): Output from which stages.
558 |
frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
559 |
-1 means not freezing any parameters.
560 |
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
561 |
562 |
563 |
def __init__(
564 |
565 |
566 |
567 |
568 |
569 |
depths=[2, 2, 6, 2],
570 |
num_heads=[3, 6, 12, 24],
571 |
572 |
573 |
574 |
575 |
576 |
577 |
578 |
579 |
580 |
581 |
out_indices=(0, 1, 2, 3),
582 |
583 |
584 |
585 |
586 |
587 |
588 |
589 |
590 |
self.pretrain_img_size = pretrain_img_size
591 |
self.num_layers = len(depths)
592 |
self.embed_dim = embed_dim
593 |
self.ape = ape
594 |
self.patch_norm = patch_norm
595 |
self.out_indices = out_indices
596 |
self.norm_indices = norm_indices if norm_indices is not None else out_indices
597 |
self.frozen_stages = frozen_stages
598 |
599 |
# split image into non-overlapping patches
600 |
self.patch_embed = PatchEmbed(
601 |
602 |
603 |
604 |
norm_layer=norm_layer if self.patch_norm else None,
605 |
606 |
607 |
# absolute position embedding
608 |
if self.ape:
609 |
pretrain_img_size = to_2tuple(pretrain_img_size)
610 |
patch_size = to_2tuple(patch_size)
611 |
patches_resolution = [
612 |
pretrain_img_size[0] // patch_size[0],
613 |
pretrain_img_size[1] // patch_size[1],
614 |
615 |
616 |
self.absolute_pos_embed = nn.Parameter(
617 |
torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1])
618 |
619 |
trunc_normal_(self.absolute_pos_embed, std=0.02)
620 |
621 |
self.pos_drop = nn.Dropout(p=drop_rate)
622 |
623 |
# stochastic depth
624 |
dpr = [
625 |
x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
626 |
] # stochastic depth decay rule
627 |
628 |
# build layers
629 |
self.layers = nn.ModuleList()
630 |
for i_layer in range(self.num_layers):
631 |
layer = BasicLayer(
632 |
dim=int(embed_dim * 2 ** i_layer),
633 |
634 |
635 |
636 |
637 |
638 |
639 |
640 |
641 |
drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
642 |
643 |
downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
644 |
645 |
646 |
647 |
648 |
num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
649 |
self.num_features = num_features
650 |
651 |
# add a norm layer for each output
652 |
for i_layer in self.norm_indices:
653 |
if i_layer >= len(self.num_features):
654 |
655 |
layer = norm_layer(num_features[i_layer])
656 |
layer_name = f"norm{i_layer}"
657 |
self.add_module(layer_name, layer)
658 |
# add projector head
659 |
self.projection = projection
660 |
if projection:
661 |
self.project_dim = project_dim
662 |
self.norm = norm_layer(self.num_features[-1])
663 |
self.projector = nn.Linear(self.num_features[-1], project_dim, bias=False)
664 |
665 |
666 |
def _freeze_stages(self):
667 |
if self.frozen_stages >= 0:
668 |
669 |
for param in self.patch_embed.parameters():
670 |
param.requires_grad = False
671 |
672 |
if self.frozen_stages >= 1 and self.ape:
673 |
self.absolute_pos_embed.requires_grad = False
674 |
675 |
if self.frozen_stages >= 2:
676 |
677 |
for i in range(0, self.frozen_stages - 1):
678 |
m = self.layers[i]
679 |
680 |
for param in m.parameters():
681 |
param.requires_grad = False
682 |
683 |
def init_weights(self, pretrained=None):
    """Initialize the weights in backbone.

    Args:
        pretrained (str, optional): Path to pre-trained weights.
            Defaults to None.
    """
    # NOTE(review): in the visible text ``pretrained`` is unused and the helper
    # below is defined but never applied (no ``self.apply(_init_weights)``), so
    # calling this method changes nothing. Confirm that this is intentional
    # before relying on it.

    def _init_weights(m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
698 |
699 |
def forward(self, x):
700 |
"""Forward function."""
701 |
x = self.patch_embed(x)
702 |
703 |
Wh, Ww = x.size(2), x.size(3)
704 |
if self.ape:
705 |
# interpolate the position embedding to the corresponding size
706 |
absolute_pos_embed = F.interpolate(
707 |
self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic"
708 |
709 |
x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C
710 |
711 |
x = x.flatten(2).transpose(1, 2)
712 |
x = self.pos_drop(x)
713 |
714 |
outs = {}
715 |
for i in range(self.num_layers):
716 |
layer = self.layers[i]
717 |
x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
718 |
719 |
if i in self.out_indices:
720 |
if i in self.norm_indices:
721 |
norm_layer = getattr(self, f"norm{i}")
722 |
x_out = norm_layer(x_out)
723 |
out = (
724 |
x_out.view(-1, H, W, self.num_features[i])
725 |
.permute(0, 3, 1, 2)
726 |
727 |
728 |
outs["res{}".format(i + 2)] = out
729 |
if self.projection:
730 |
x_out = self.norm(x_out)
731 |
x_out = x_out.view(-1, H, W, self.num_features[-1]).contiguous()
732 |
outs["fc"] = self.projector(x_out).permute(0, 3, 1, 2)
733 |
734 |
return outs
735 |
736 |
def train(self, mode=True):
    """Convert the model into training mode while keeping layers frozen."""
    super(SwinTransformer, self).train(mode)
    # Re-apply the freeze after switching mode so frozen stages stay frozen.
    self._freeze_stages()
739 |
740 |
741 |
742 |
743 |
class D2SwinTransformer(SwinTransformer, Backbone):
744 |
def __init__(self, cfg, input_shape):
745 |
746 |
pretrain_img_size = cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE
747 |
patch_size = cfg.MODEL.SWIN.PATCH_SIZE
748 |
in_chans = 3
749 |
embed_dim = cfg.MODEL.SWIN.EMBED_DIM
750 |
depths = cfg.MODEL.SWIN.DEPTHS
751 |
num_heads = cfg.MODEL.SWIN.NUM_HEADS
752 |
window_size = cfg.MODEL.SWIN.WINDOW_SIZE
753 |
mlp_ratio = cfg.MODEL.SWIN.MLP_RATIO
754 |
qkv_bias = cfg.MODEL.SWIN.QKV_BIAS
755 |
qk_scale = cfg.MODEL.SWIN.QK_SCALE
756 |
drop_rate = cfg.MODEL.SWIN.DROP_RATE
757 |
attn_drop_rate = cfg.MODEL.SWIN.ATTN_DROP_RATE
758 |
drop_path_rate = cfg.MODEL.SWIN.DROP_PATH_RATE
759 |
norm_layer = nn.LayerNorm
760 |
ape = cfg.MODEL.SWIN.APE
761 |
patch_norm = cfg.MODEL.SWIN.PATCH_NORM
762 |
norm_indices = cfg.MODEL.SWIN.NORM_INDICES
763 |
projection = cfg.MODEL.SWIN.PROJECTION
764 |
project_dim = cfg.MODEL.SWIN.PROJECT_DIM
765 |
766 |
767 |
768 |
769 |
770 |
771 |
772 |
773 |
774 |
775 |
776 |
777 |
778 |
779 |
780 |
781 |
782 |
783 |
784 |
785 |
786 |
787 |
self._out_features = cfg.MODEL.SWIN.OUT_FEATURES
788 |
789 |
self._out_feature_strides = {
790 |
"res2": 4,
791 |
"res3": 8,
792 |
"res4": 16,
793 |
"res5": 32,
794 |
"fc": 32,
795 |
796 |
self._out_feature_channels = {
797 |
"res2": self.num_features[0],
798 |
"res3": self.num_features[1],
799 |
"res4": self.num_features[2],
800 |
"res5": self.num_features[3],
801 |
"fc": self.num_features[3],
802 |
803 |
804 |
def forward(self, x):
    """
    Args:
        x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
    Returns:
        dict[str->Tensor]: names and the corresponding features
    """
    assert (
        x.dim() == 4
    ), f"SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!"
    y = super().forward(x)
    # Keep only the feature maps requested through ``self._out_features``.
    return {k: v for k, v in y.items() if k in self._out_features}
820 |
821 |
def output_shape(self):
822 |
return {
823 |
name: ShapeSpec(
824 |
825 |
826 |
827 |
for name in self._out_features
828 |
829 |
830 |
831 |
def size_divisibility(self):
832 |
return 32
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 |
from .text_template import (
5 |
6 |
7 |
8 |
9 |
from .adapter import ClipAdapter, MaskFormerClipAdapter
10 |
11 |
12 |
def build_text_prompt(cfg):
    """Build the prompt extractor selected by ``cfg.TEXT_TEMPLATES``.

    Args:
        cfg: config node exposing ``TEXT_TEMPLATES`` (one of ``"predefined"``,
            ``"imagenet"``, ``"vild"``) and, for ``"predefined"``,
            ``PREDEFINED_PROMPT_TEMPLATES``.

    Returns:
        The constructed prompt extractor.

    Raises:
        NotImplementedError: if ``cfg.TEXT_TEMPLATES`` is none of the above.
    """
    if cfg.TEXT_TEMPLATES == "predefined":
        text_templates = PredefinedPromptExtractor(cfg.PREDEFINED_PROMPT_TEMPLATES)
    elif cfg.TEXT_TEMPLATES == "imagenet":
        text_templates = ImageNetPromptExtractor()
    elif cfg.TEXT_TEMPLATES == "vild":
        text_templates = VILDPromptExtractor()
    else:
        raise NotImplementedError(
            "Prompt learner {} is not supported".format(cfg.TEXT_TEMPLATES)
        )
    return text_templates
@@ -0,0 +1,206 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
# Modified by Feng Liang from
4 |
5 |
6 |
from typing import List
7 |
import torch
8 |
from torch import nn
9 |
from torch.nn import functional as F
10 |
from detectron2.structures import BitMasks
11 |
from .utils import build_clip_model, crop_with_mask
12 |
from .text_template import PromptExtractor
13 |
14 |
15 |
PIXEL_MEAN = (0.48145466, 0.4578275, 0.40821073)
16 |
PIXEL_STD = (0.26862954, 0.26130258, 0.27577711)
17 |
18 |
19 |
class ClipAdapter(nn.Module):
    """Wrap a CLIP model and a prompt extractor to score images against text.

    Text features are cached per noun in ``self.text_feature_buffer`` so each
    noun is only run through the prompt extractor once.
    """

    def __init__(self, clip_model_name: str, mask_prompt_depth: int, text_templates: PromptExtractor):
        super().__init__()
        self.clip_model = build_clip_model(clip_model_name, mask_prompt_depth)
        self.text_templates = text_templates
        # noun -> text feature cache filled lazily by ``_get_text_features``.
        self.text_feature_buffer = {}

    def forward(self, image: torch.Tensor, text: List[str], **kwargs):
        """Return similarity logits between ``image`` and every noun in ``text``."""
        image = self._preprocess_image(image, **kwargs)
        text_feature = self.get_text_features(text)  # k,feat_dim
        image_features = self.get_image_features(image)
        return self.get_sim_logits(text_feature, image_features)

    def _preprocess_image(self, image: torch.Tensor, **kwargs):
        """Return ``image`` unchanged.

        ``**kwargs`` is accepted and ignored because ``forward`` forwards its
        extra keyword arguments here; without it any such argument raised a
        ``TypeError``.
        """
        return image

    def _get_text_features(self, noun_list: List[str]):
        """Return stacked text features for ``noun_list``, computing only uncached nouns."""
        left_noun_list = [
            noun for noun in noun_list if noun not in self.text_feature_buffer
        ]
        if len(left_noun_list) > 0:
            left_text_features = self.text_templates(
                left_noun_list, self.clip_model
            )
            self.text_feature_buffer.update(
                {
                    noun: text_feature
                    for noun, text_feature in zip(
                        left_noun_list, left_text_features
                    )
                }
            )
        return torch.stack([self.text_feature_buffer[noun] for noun in noun_list])

    def get_text_features(self, noun_list: List[str]):
        """Public hook for text features; subclasses may extend the result."""
        return self._get_text_features(noun_list)

    def get_image_features(self, image: torch.Tensor):
        """Encode ``image`` with the CLIP visual tower and L2-normalise the result."""
        return self.normalize_feature(self.clip_model.visual(image))

    def get_sim_logits(
        self,
        text_features: torch.Tensor,
        image_features: torch.Tensor,
        temperature: float = 100,
    ):
        """Return ``temperature * image_features @ text_features.T``."""
        return temperature * image_features @ text_features.T

    def normalize_feature(self, feat: torch.Tensor):
        """Divide ``feat`` by its norm along the last dimension."""
        return feat / feat.norm(dim=-1, keepdim=True)
73 |
74 |
75 |
class MaskFormerClipAdapter(ClipAdapter):
    """CLIP adapter that classifies masked image regions.

    Each mask is turned into a cropped, background-filled region that is fed
    to CLIP; text features get an extra learnable "non-object" embedding
    appended as the last class.
    """

    def __init__(
        self,
        clip_model_name: str,
        text_templates: PromptExtractor,
        mask_fill: str = "mean",
        mask_expand_ratio: float = 1.0,
        mask_thr: float = 0.5,
        mask_matting: bool = False,
        region_resized: bool = True,
        mask_prompt_depth: int = 0,
        mask_prompt_fwd: bool = False,
    ):
        """
        Args:
            clip_model_name: name passed on to the CLIP model builder.
            text_templates: prompt extractor producing text features.
            mask_fill: ``"zero"`` or ``"mean"``; colour used outside the mask.
            mask_expand_ratio: ratio forwarded to ``crop_with_mask``.
            mask_thr: threshold used to binarise the input masks.
            mask_matting: keep the soft mask when True, else use the binary one.
            region_resized: resize every region to 224x224 and batch them.
            mask_prompt_depth: forwarded to the CLIP model builder.
            mask_prompt_fwd: pass the region masks to the visual encoder.

        Raises:
            NotImplementedError: for an unknown ``mask_fill``.
        """
        super().__init__(clip_model_name, mask_prompt_depth, text_templates)
        self.non_object_embedding = nn.Parameter(
            torch.empty(1, self.clip_model.text_projection.shape[-1])
        )
        nn.init.normal_(
            self.non_object_embedding.data,
            std=self.clip_model.transformer.width ** -0.5,
        )
        # for test
        self.mask_fill = mask_fill
        if self.mask_fill == "zero":
            self.mask_fill = (0.0, 0.0, 0.0)
        elif self.mask_fill == "mean":
            self.mask_fill = [255.0 * c for c in PIXEL_MEAN]
        else:
            raise NotImplementedError(
                "Unknown mask_fill method: {}".format(self.mask_fill)
            )
        self.mask_expand_ratio = mask_expand_ratio
        self.mask_thr = mask_thr
        self.mask_matting = mask_matting
        self.region_resized = region_resized
        self.mask_prompt_fwd = mask_prompt_fwd
        # Normalisation statistics scaled by 255.
        self.register_buffer(
            "pixel_mean", torch.Tensor(PIXEL_MEAN).reshape(1, 3, 1, 1) * 255.0
        )
        self.register_buffer(
            "pixel_std", torch.Tensor(PIXEL_STD).reshape(1, 3, 1, 1) * 255.0
        )

    def forward(
        self,
        image: torch.Tensor,
        text: List[str],
        mask: torch.Tensor,
        normalize: bool = True,
        fwd_w_region_mask: bool = False,
    ):
        """Score every valid masked region of ``image`` against ``text``.

        Returns:
            ``(logits, unnorm_regions, valid_flag)`` normally, or
            ``(None, valid_flag)`` when no mask survives thresholding.
            ``fwd_w_region_mask`` is accepted but not read by this method.
        """
        (regions, unnorm_regions), region_masks, valid_flag = self._preprocess_image(
            image, mask, normalize=normalize
        )
        if regions is None:
            return None, valid_flag
        if isinstance(regions, list):
            # NOTE(review): this assert is always true (it tests the exception
            # class itself); kept as-is to preserve behaviour.
            assert NotImplementedError
            image_features = torch.cat(
                [self.get_image_features(image_i) for image_i in regions], dim=0
            )
        else:
            if self.mask_prompt_fwd:
                image_features = self.get_image_features(regions, region_masks)
            else:
                image_features = self.get_image_features(regions)
        text_feature = self.get_text_features(text)  # k,feat_dim
        return self.get_sim_logits(text_feature, image_features), unnorm_regions, valid_flag

    def get_image_features(self, image, region_masks=None):
        """Encode ``image`` (optionally with ``region_masks``) and L2-normalise."""
        image_features = self.clip_model.visual(image, region_masks)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        return image_features

    def _preprocess_image(
        self, image: torch.Tensor, mask: torch.Tensor, normalize: bool = True
    ):
        """crop, mask and normalize the image

        Args:
            image ([type]): [C,H,W]
            mask ([type]): [K,H,W]
            normalize (bool, optional): subtract ``pixel_mean`` and divide by
                ``pixel_std``. Defaults to True.

        Returns:
            ``((regions, unnorm_regions), region_masks, valid)``. When no mask
            has a pixel above ``self.mask_thr`` the first three values are
            ``None`` so the caller can still unpack the result.
        """
        dtype = mask.dtype
        bin_mask = mask > self.mask_thr
        # A mask is valid when at least one pixel passes the threshold.
        valid = bin_mask.sum(dim=(-1, -2)) > 0
        bin_mask = bin_mask[valid]
        mask = mask[valid]
        if not self.mask_matting:
            mask = bin_mask
        bin_mask = BitMasks(bin_mask)
        bboxes = bin_mask.get_bounding_boxes()
        # crop,mask
        regions = []
        region_masks = []
        for bbox, single_mask in zip(bboxes, mask):
            region, region_mask = crop_with_mask(
                image.type(dtype),
                single_mask.type(dtype),
                bbox,
                fill=self.mask_fill,
                expand_ratio=self.mask_expand_ratio,
            )
            regions.append(region.unsqueeze(0))
            region_masks.append(region_mask.unsqueeze(0))
        if len(regions) == 0:
            # Same nesting as the normal return; the previous 2-tuple could
            # not be unpacked by ``forward``.
            return (None, None), None, valid
        unnorm_regions = regions
        if normalize:
            regions = [(r - self.pixel_mean) / self.pixel_std for r in regions]
        # resize
        if self.region_resized:
            regions = [
                F.interpolate(r, size=(224, 224), mode="bicubic") for r in regions
            ]
            regions = torch.cat(regions)
            region_masks = [
                F.interpolate(r, size=(224, 224), mode="nearest") for r in region_masks
            ]
            region_masks = torch.cat(region_masks)
            unnorm_regions = [
                F.interpolate(r, size=(224, 224), mode="bicubic") for r in unnorm_regions
            ]
            unnorm_regions = torch.cat(unnorm_regions)
        return (regions, unnorm_regions), region_masks, valid

    def get_text_features(self, noun_list: List[str]):
        """Return the noun features with the normalised non-object embedding appended."""
        object_text_features = self._get_text_features(noun_list)
        non_object_text_features = (
            self.non_object_embedding
            / self.non_object_embedding.norm(dim=-1, keepdim=True)
        )
        return torch.cat([object_text_features, non_object_text_features], dim=0)
@@ -0,0 +1,155 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
# Modified by Feng Liang from
4 |
5 |
6 |
7 |
from typing import List
8 |
9 |
import clip
10 |
import torch
11 |
from torch import nn
12 |
13 |
14 |
"a bad photo of a {}.",
15 |
"a photo of many {}.",
16 |
"a sculpture of a {}.",
17 |
"a photo of the hard to see {}.",
18 |
"a low resolution photo of the {}.",
19 |
"a rendering of a {}.",
20 |
"graffiti of a {}.",
21 |
"a bad photo of the {}.",
22 |
"a cropped photo of the {}.",
23 |
"a tattoo of a {}.",
24 |
"the embroidered {}.",
25 |
"a photo of a hard to see {}.",
26 |
"a bright photo of a {}.",
27 |
"a photo of a clean {}.",
28 |
"a photo of a dirty {}.",
29 |
"a dark photo of the {}.",
30 |
"a drawing of a {}.",
31 |
"a photo of my {}.",
32 |
"the plastic {}.",
33 |
"a photo of the cool {}.",
34 |
"a close-up photo of a {}.",
35 |
"a black and white photo of the {}.",
36 |
"a painting of the {}.",
37 |
"a painting of a {}.",
38 |
"a pixelated photo of the {}.",
39 |
"a sculpture of the {}.",
40 |
"a bright photo of the {}.",
41 |
"a cropped photo of a {}.",
42 |
"a plastic {}.",
43 |
"a photo of the dirty {}.",
44 |
"a jpeg corrupted photo of a {}.",
45 |
"a blurry photo of the {}.",
46 |
"a photo of the {}.",
47 |
"a good photo of the {}.",
48 |
"a rendering of the {}.",
49 |
"a {} in a video game.",
50 |
"a photo of one {}.",
51 |
"a doodle of a {}.",
52 |
"a close-up photo of the {}.",
53 |
"a photo of a {}.",
54 |
"the origami {}.",
55 |
"the {} in a video game.",
56 |
"a sketch of a {}.",
57 |
"a doodle of the {}.",
58 |
"a origami {}.",
59 |
"a low resolution photo of a {}.",
60 |
"the toy {}.",
61 |
"a rendition of the {}.",
62 |
"a photo of the clean {}.",
63 |
"a photo of a large {}.",
64 |
"a rendition of a {}.",
65 |
"a photo of a nice {}.",
66 |
"a photo of a weird {}.",
67 |
"a blurry photo of a {}.",
68 |
"a cartoon {}.",
69 |
"art of a {}.",
70 |
"a sketch of the {}.",
71 |
"a embroidered {}.",
72 |
"a pixelated photo of a {}.",
73 |
"itap of the {}.",
74 |
"a jpeg corrupted photo of the {}.",
75 |
"a good photo of a {}.",
76 |
"a plushie {}.",
77 |
"a photo of the nice {}.",
78 |
"a photo of the small {}.",
79 |
"a photo of the weird {}.",
80 |
"the cartoon {}.",
81 |
"art of the {}.",
82 |
"a drawing of the {}.",
83 |
"a photo of the large {}.",
84 |
"a black and white photo of a {}.",
85 |
"the plushie {}.",
86 |
"a dark photo of a {}.",
87 |
"itap of a {}.",
88 |
"graffiti of the {}.",
89 |
"a toy {}.",
90 |
"itap of my {}.",
91 |
"a photo of a cool {}.",
92 |
"a photo of a small {}.",
93 |
"a tattoo of the {}.",
94 |
95 |
96 |
97 |
"a photo of a {}.",
98 |
"This is a photo of a {}",
99 |
"There is a {} in the scene",
100 |
"There is the {} in the scene",
101 |
"a photo of a {} in the scene",
102 |
"a photo of a small {}.",
103 |
"a photo of a medium {}.",
104 |
"a photo of a large {}.",
105 |
"This is a photo of a small {}.",
106 |
"This is a photo of a medium {}.",
107 |
"This is a photo of a large {}.",
108 |
"There is a small {} in the scene.",
109 |
"There is a medium {} in the scene.",
110 |
"There is a large {} in the scene.",
111 |
112 |
113 |
class PromptExtractor(nn.Module):
    """Base class for modules that turn a list of nouns into text features."""

    def __init__(self):
        super().__init__()
        # Set to True once ``init_buffer`` has been called.
        self._buffer_init = False

    def init_buffer(self, clip_model):
        """Mark the extractor's buffer as initialised."""
        self._buffer_init = True

    def forward(self, noun_list: List[str], clip_model: nn.Module):
        """Compute text features for ``noun_list``; must be overridden."""
        raise NotImplementedError()
123 |
124 |
125 |
class PredefinedPromptExtractor(PromptExtractor):
    """Prompt extractor that ensembles a fixed list of format-string templates."""

    def __init__(self, templates: List[str]):
        super().__init__()
        self.templates = templates

    def forward(self, noun_list: List[str], clip_model: nn.Module):
        """Return one normalised text feature per noun, averaged over all templates.

        Raises:
            ValueError: if no templates are configured (previously surfaced as
                an ``UnboundLocalError`` from the trailing ``del``).
        """
        if not self.templates:
            raise ValueError("PredefinedPromptExtractor requires at least one template")
        text_features_bucket = []
        for template in self.templates:
            noun_tokens = [clip.tokenize(template.format(noun)) for noun in noun_list]
            text_inputs = torch.cat(noun_tokens).to(
                clip_model.text_projection.data.device
            )
            text_features = clip_model.encode_text(text_inputs)
            text_features /= text_features.norm(dim=-1, keepdim=True)
            text_features_bucket.append(text_features)
        del text_inputs
        # ensemble by averaging
        text_features = torch.stack(text_features_bucket).mean(dim=0)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        return text_features
146 |
147 |
148 |
class ImageNetPromptExtractor(PredefinedPromptExtractor):
149 |
def __init__(self):
150 |
151 |
152 |
153 |
class VILDPromptExtractor(PredefinedPromptExtractor):
154 |
def __init__(self):
155 |
@@ -0,0 +1,81 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 |
from typing import Tuple
5 |
import numpy as np
6 |
import torch
7 |
import clip
8 |
from detectron2.utils.comm import get_local_rank, synchronize
9 |
10 |
11 |
def expand_box(
    x1: float,
    y1: float,
    x2: float,
    y2: float,
    expand_ratio: float = 1.0,
    max_h: int = None,
    max_w: int = None,
):
    """Scale a box about its centre and optionally clamp it.

    Args:
        x1, y1, x2, y2: box corners.
        expand_ratio: factor applied to the box width and height.
        max_h: when given, ``y`` values are clamped to ``[0, max_h - 1]``.
        max_w: when given, ``x`` values are clamped to ``[0, max_w - 1]``.

    Returns:
        ``[x1, y1, x2, y2]`` with every value truncated by ``int``.
    """
    cx = 0.5 * (x1 + x2)
    cy = 0.5 * (y1 + y2)
    w = (x2 - x1) * expand_ratio
    h = (y2 - y1) * expand_ratio
    box = [cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h]
    if max_h is not None:
        box[1] = max(0, box[1])
        box[3] = min(max_h - 1, box[3])
    if max_w is not None:
        box[0] = max(0, box[0])
        box[2] = min(max_w - 1, box[2])
    return [int(b) for b in box]
34 |
35 |
36 |
def mask2box(mask: torch.Tensor):
    """Return the tight bounding box of the non-zero area of a 2-D mask.

    Args:
        mask: 2-D tensor indexed as ``[y, x]``.

    Returns:
        ``(x1, y1, x2, y2)`` with exclusive ``x2``/``y2``, or ``None`` when
        every column of the mask sums to zero.
    """
    # use naive way
    # Summing over dim 0 collapses rows, so the indices are x coordinates.
    xs = torch.nonzero(mask.sum(dim=0))[:, 0]
    if len(xs) == 0:
        return None
    # Use torch.nonzero for the y axis too; np.nonzero only worked on a
    # tensor through numpy's method-dispatch fallback.
    ys = torch.nonzero(mask.sum(dim=1))[:, 0]
    return xs.min(), ys.min(), xs.max() + 1, ys.max() + 1
47 |
48 |
49 |
def crop_with_mask(
    image: torch.Tensor,
    mask: torch.Tensor,
    bbox: torch.Tensor,
    fill: Tuple[float, float, float] = (0, 0, 0),
    expand_ratio: float = 1.0,
):
    """Crop ``image`` to the expanded ``bbox`` and fill the area outside ``mask``.

    Args:
        image: tensor unpacked as ``(_, h, w)``.
        mask: 2-D tensor sliced with the same ``[t:b, l:r]`` window as the image.
        bbox: four values passed to ``expand_box``.
        fill: one fill value per output channel.
        expand_ratio: forwarded to ``expand_box``.

    Returns:
        ``(region, region_mask)``: the crop blended as
        ``image * mask + (1 - mask) * fill`` and the cropped mask with a
        leading singleton dimension.
    """
    left, top, right, bottom = expand_box(*bbox, expand_ratio)
    _, h, w = image.shape
    # Keep the crop window inside the image.
    left = max(left, 0)
    top = max(top, 0)
    right = min(right, w)
    bottom = min(bottom, h)
    # One constant plane per fill value.
    new_image = torch.cat(
        [image.new_full((1, bottom - top, right - left), fill_value=val) for val in fill]
    )
    region_mask = mask[None, top:bottom, left:right]
    region = image[:, top:bottom, left:right] * region_mask + (1 - region_mask) * new_image
    return region, region_mask
67 |
68 |
69 |
def build_clip_model(model: str, mask_prompt_depth: int = 0, frozen: bool = True):
    """Load a CLIP model on CPU, letting local rank 0 load first.

    Args:
        model: model name handed to ``clip.load``.
        mask_prompt_depth: forwarded to ``clip.load``.
        frozen: when True, ``requires_grad`` is disabled on every parameter.

    Returns:
        The loaded CLIP model.
    """
    rank = get_local_rank()
    if rank == 0:
        # download on rank 0 only
        model, _ = clip.load(model, mask_prompt_depth=mask_prompt_depth, device="cpu")
    # Other ranks wait here until rank 0 has finished loading.
    synchronize()
    if rank != 0:
        model, _ = clip.load(model, mask_prompt_depth=mask_prompt_depth, device="cpu")
    synchronize()
    if frozen:
        for param in model.parameters():
            param.requires_grad = False
    return model
@@ -0,0 +1,229 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Modified by Bowen Cheng from
3 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
4 |
5 |
6 |
MaskFormer criterion.
7 |
8 |
import torch
9 |
import torch.nn.functional as F
10 |
from torch import nn
11 |
12 |
from detectron2.utils.comm import get_world_size
13 |
14 |
from ..utils.misc import is_dist_avail_and_initialized, nested_tensor_from_tensor_list
15 |
16 |
17 |
def dice_loss(inputs, targets, num_masks):
    """
    Compute the DICE loss, similar to generalized IOU for masks
    Args:
        inputs: A float tensor of arbitrary shape.
                The predictions for each example.
        targets: A float tensor with the same shape as inputs. Stores the binary
                 classification label for each element in inputs
                (0 for the negative class and 1 for the positive class).
        num_masks: divisor applied to the summed per-example loss.
    """
    inputs = inputs.sigmoid()
    inputs = inputs.flatten(1)
    numerator = 2 * (inputs * targets).sum(-1)
    denominator = inputs.sum(-1) + targets.sum(-1)
    # +1 smoothing in numerator and denominator avoids division by zero.
    loss = 1 - (numerator + 1) / (denominator + 1)
    return loss.sum() / num_masks
33 |
34 |
35 |
def sigmoid_focal_loss(
    inputs, targets, num_masks, alpha: float = 0.25, gamma: float = 2
):
    """
    Loss used in RetinaNet for dense detection:
    Args:
        inputs: A float tensor of arbitrary shape.
                The predictions for each example.
        targets: A float tensor with the same shape as inputs. Stores the binary
                 classification label for each element in inputs
                (0 for the negative class and 1 for the positive class).
        num_masks: divisor applied to the summed per-example loss.
        alpha: (optional) Weighting factor in range (0,1) to balance
                positive vs negative examples. Default = 0.25; a negative
                value disables the weighting.
        gamma: Exponent of the modulating factor (1 - p_t) to
               balance easy vs hard examples.
    Returns:
        Loss tensor
    """
    prob = inputs.sigmoid()
    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    # p_t is the predicted probability of the true class.
    p_t = prob * targets + (1 - prob) * (1 - targets)
    loss = ce_loss * ((1 - p_t) ** gamma)

    if alpha >= 0:
        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
        loss = alpha_t * loss

    return loss.mean(1).sum() / num_masks
63 |
64 |
65 |
class SetCriterion(nn.Module):
66 |
"""This class computes the loss for DETR.
67 |
The process happens in two steps:
68 |
1) we compute hungarian assignment between ground truth boxes and the outputs of the model
69 |
2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
70 |
71 |
72 |
def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses):
73 |
"""Create the criterion.
74 |
75 |
num_classes: number of object categories, omitting the special no-object category
76 |
matcher: module able to compute a matching between targets and proposals
77 |
weight_dict: dict containing as key the names of the losses and as values their relative weight.
78 |
eos_coef: relative classification weight applied to the no-object category
79 |
losses: list of all the losses to be applied. See get_loss for list of available losses.
80 |
81 |
82 |
self.num_classes = num_classes
83 |
self.matcher = matcher
84 |
self.weight_dict = weight_dict
85 |
self.eos_coef = eos_coef
86 |
self.losses = losses
87 |
if eos_coef > 0:
88 |
89 |
empty_weight = torch.ones(self.num_classes + 1)
90 |
91 |
empty_weight[-1] = self.eos_coef
92 |
self.register_buffer("empty_weight", empty_weight)
93 |
self.use_ignore_idx = False
94 |
95 |
self.use_ignore_idx = True
96 |
self.cur_target = []
97 |
98 |
def loss_labels(self, outputs, targets, indices, num_masks):
99 |
"""Classification loss (NLL)
100 |
targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
101 |
102 |
assert "pred_logits" in outputs
103 |
src_logits = outputs["pred_logits"]
104 |
105 |
idx = self._get_src_permutation_idx(indices)
106 |
target_classes_o =
107 |
[t["labels"][J] for t, (_, J) in zip(targets, indices)]
108 |
109 |
target_classes = torch.full(
110 |
111 |
112 |
113 |
114 |
115 |
target_classes[idx] = target_classes_o
116 |
if self.use_ignore_idx:
117 |
loss_ce = F.cross_entropy(
118 |
src_logits.transpose(1, 2),
119 |
120 |
121 |
122 |
123 |
if "empty_weight" in outputs:
124 |
empty_weight =
125 |
[outputs["empty_weight"], self.empty_weight[-1:]]
126 |
127 |
128 |
empty_weight = self.empty_weight
129 |
loss_ce = F.cross_entropy(
130 |
src_logits.transpose(1, 2), target_classes, empty_weight
131 |
132 |
losses = {"loss_ce": loss_ce}
133 |
return losses
134 |
135 |
def loss_masks(self, outputs, targets, indices, num_masks):
136 |
"""Compute the losses related to the masks: the focal loss and the dice loss.
137 |
targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
138 |
139 |
assert "pred_masks" in outputs
140 |
141 |
src_idx = self._get_src_permutation_idx(indices)
142 |
tgt_idx = self._get_tgt_permutation_idx(indices)
143 |
src_masks = outputs["pred_masks"]
144 |
src_masks = src_masks[src_idx]
145 |
masks = [t["masks"] for t in targets]
146 |
# TODO use valid to mask invalid areas due to padding in loss
147 |
target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
148 |
target_masks =
149 |
target_masks = target_masks[tgt_idx]
150 |
151 |
# upsample predictions to the target size
152 |
src_masks = F.interpolate(
153 |
src_masks[:, None],
154 |
155 |
156 |
157 |
158 |
src_masks = src_masks[:, 0].flatten(1)
159 |
160 |
target_masks = target_masks.flatten(1)
161 |
target_masks = target_masks.view(src_masks.shape)
162 |
losses = {
163 |
"loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_masks),
164 |
"loss_dice": dice_loss(src_masks, target_masks, num_masks),
165 |
166 |
return losses
167 |
168 |
def _get_src_permutation_idx(self, indices):
169 |
# permute predictions following indices
170 |
batch_idx =
171 |
[torch.full_like(src, i) for i, (src, _) in enumerate(indices)]
172 |
173 |
src_idx =[src for (src, _) in indices])
174 |
return batch_idx, src_idx
175 |
176 |
def _get_tgt_permutation_idx(self, indices):
177 |
# permute targets following indices
178 |
batch_idx =
179 |
[torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]
180 |
181 |
tgt_idx =[tgt for (_, tgt) in indices])
182 |
return batch_idx, tgt_idx
183 |
184 |
def get_loss(self, loss, outputs, targets, indices, num_masks):
    """Dispatch to the loss method registered under the name ``loss``.

    Supported names are ``"labels"`` and ``"masks"``; any other name trips
    the assertion below.
    """
    handlers = {"labels": self.loss_labels, "masks": self.loss_masks}
    assert loss in handlers, f"do you really want to compute {loss} loss?"
    compute = handlers[loss]
    return compute(outputs, targets, indices, num_masks)
188 |
189 |
def forward(self, outputs, targets):
190 |
"""This performs the loss computation.
191 |
192 |
outputs: dict of tensors, see the output specification of the model for the format
193 |
targets: list of dicts, such that len(targets) == batch_size.
194 |
The expected keys in each dict depends on the losses applied, see each loss' doc
195 |
196 |
outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"}
197 |
198 |
# Retrieve the matching between the outputs of the last layer and the targets
199 |
indices = self.matcher(outputs_without_aux, targets)
200 |
201 |
# Compute the average number of target boxes across all nodes, for normalization purposes
202 |
num_masks = sum(len(t["labels"]) for t in targets)
203 |
num_masks = torch.as_tensor(
204 |
[num_masks], dtype=torch.float, device=next(iter(outputs.values())).device
205 |
206 |
if is_dist_avail_and_initialized():
207 |
208 |
num_masks = torch.clamp(num_masks / get_world_size(), min=1).item()
209 |
210 |
# Compute all the requested losses
211 |
losses = {}
212 |
for loss in self.losses:
213 |
losses.update(self.get_loss(loss, outputs, targets, indices, num_masks))
214 |
215 |
# In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
216 |
if "aux_outputs" in outputs:
217 |
for i, aux_outputs in enumerate(outputs["aux_outputs"]):
218 |
indices = self.matcher(aux_outputs, targets)
219 |
for loss in self.losses:
220 |
l_dict = self.get_loss(
221 |
loss, aux_outputs, targets, indices, num_masks
222 |
223 |
l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
224 |
225 |
226 |
return losses
227 |
228 |
def clean_buffer(self):
229 |
self.cur_target = []
@@ -0,0 +1,2 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
@@ -0,0 +1,135 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 |
import logging
5 |
from copy import deepcopy
6 |
from typing import Callable, Dict, List, Optional, Tuple, Union
7 |
8 |
import fvcore.nn.weight_init as weight_init
9 |
from torch import nn
10 |
from torch.nn import functional as F
11 |
12 |
from detectron2.config import configurable
13 |
from detectron2.layers import Conv2d, ShapeSpec, get_norm
14 |
from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
15 |
16 |
from ..transformer.transformer_predictor import TransformerPredictor
17 |
from .pixel_decoder import build_pixel_decoder
18 |
19 |
20 |
21 |
class MaskFormerHead(nn.Module):
22 |
23 |
_version = 2
24 |
25 |
def _load_from_state_dict(
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
version = local_metadata.get("version", None)
36 |
if version is None or version < 2:
37 |
# Do not warn if train from scratch
38 |
scratch = True
39 |
logger = logging.getLogger(__name__)
40 |
for k in list(state_dict.keys()):
41 |
newk = k
42 |
if "sem_seg_head" in k and not k.startswith(prefix + "predictor"):
43 |
newk = k.replace(prefix, prefix + "pixel_decoder.")
44 |
# logger.debug(f"{k} ==> {newk}")
45 |
if newk != k:
46 |
state_dict[newk] = state_dict[k]
47 |
del state_dict[k]
48 |
scratch = False
49 |
50 |
if not scratch:
51 |
52 |
f"Weight format of {self.__class__.__name__} have changed! "
53 |
"Please upgrade your models. Applying automatic conversion now ..."
54 |
55 |
56 |
57 |
def __init__(
58 |
59 |
input_shape: Dict[str, ShapeSpec],
60 |
61 |
num_classes: int,
62 |
pixel_decoder: nn.Module,
63 |
loss_weight: float = 1.0,
64 |
ignore_value: int = -1,
65 |
# extra parameters
66 |
transformer_predictor: nn.Module,
67 |
transformer_in_feature: str,
68 |
69 |
70 |
NOTE: this interface is experimental.
71 |
72 |
input_shape: shapes (channels and stride) of the input features
73 |
num_classes: number of classes to predict
74 |
pixel_decoder: the pixel decoder module
75 |
loss_weight: loss weight
76 |
ignore_value: category id to be ignored during training.
77 |
transformer_predictor: the transformer decoder that makes prediction
78 |
transformer_in_feature: input feature name to the transformer_predictor
79 |
80 |
81 |
input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
82 |
self.in_features = [k for k, v in input_shape]
83 |
feature_strides = [v.stride for k, v in input_shape]
84 |
feature_channels = [v.channels for k, v in input_shape]
85 |
86 |
self.ignore_value = ignore_value
87 |
self.common_stride = 4
88 |
self.loss_weight = loss_weight
89 |
90 |
self.pixel_decoder = pixel_decoder
91 |
self.predictor = transformer_predictor
92 |
self.transformer_in_feature = transformer_in_feature
93 |
94 |
self.num_classes = num_classes
95 |
96 |
97 |
def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
98 |
return {
99 |
"input_shape": {
100 |
k: v
101 |
for k, v in input_shape.items()
102 |
103 |
104 |
"ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
105 |
"num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
106 |
"pixel_decoder": build_pixel_decoder(cfg, input_shape),
107 |
"loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
108 |
"transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE,
109 |
"transformer_predictor": TransformerPredictor(
110 |
111 |
112 |
if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder"
113 |
else input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels,
114 |
115 |
116 |
117 |
118 |
def forward(self, features):
119 |
return self.layers(features)
120 |
121 |
def layers(self, features):
    """Run the pixel decoder, then the predictor on the configured input feature.

    When ``self.transformer_in_feature`` is ``"transformer_encoder"`` the
    predictor consumes the decoder's transformer-encoder features; otherwise
    it consumes ``features[self.transformer_in_feature]``.
    """
    (
        mask_features,
        transformer_encoder_features,
    ) = self.pixel_decoder.forward_features(features)
    if self.transformer_in_feature == "transformer_encoder":
        assert (
            transformer_encoder_features is not None
        ), "Please use the TransformerEncoderPixelDecoder."
        predictions = self.predictor(transformer_encoder_features, mask_features)
    else:
        predictions = self.predictor(
            features[self.transformer_in_feature], mask_features
        )
    return predictions
@@ -0,0 +1,145 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
# Modified by Feng Liang from
4 |
5 |
6 |
import logging
7 |
from copy import deepcopy
8 |
from typing import Callable, Dict, List, Optional, Tuple, Union
9 |
10 |
import fvcore.nn.weight_init as weight_init
11 |
from torch import nn
12 |
from torch.nn import functional as F
13 |
14 |
from detectron2.config import configurable
15 |
from detectron2.layers import Conv2d, ShapeSpec, get_norm
16 |
from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
17 |
18 |
from ..transformer.open_vocab_transformer_predictor import OpenVocabTransformerPredictor
19 |
from .pixel_decoder import build_pixel_decoder
20 |
21 |
22 |
23 |
class OpenVocabMaskFormerHead(nn.Module):
24 |
25 |
_version = 2
26 |
27 |
def _load_from_state_dict(
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
version = local_metadata.get("version", None)
38 |
if version is None or version < 2:
39 |
# Do not warn if train from scratch
40 |
scratch = True
41 |
logger = logging.getLogger(__name__)
42 |
for k in list(state_dict.keys()):
43 |
newk = k
44 |
if "sem_seg_head" in k and not k.startswith(prefix + "predictor"):
45 |
newk = k.replace(prefix, prefix + "pixel_decoder.")
46 |
# logger.debug(f"{k} ==> {newk}")
47 |
if newk != k:
48 |
state_dict[newk] = state_dict[k]
49 |
del state_dict[k]
50 |
scratch = False
51 |
52 |
if not scratch:
53 |
54 |
f"Weight format of {self.__class__.__name__} have changed! "
55 |
"Please upgrade your models. Applying automatic conversion now ..."
56 |
57 |
58 |
59 |
def __init__(
60 |
61 |
input_shape: Dict[str, ShapeSpec],
62 |
63 |
num_classes: int,
64 |
pixel_decoder: nn.Module,
65 |
loss_weight: float = 1.0,
66 |
ignore_value: int = -1,
67 |
# extra parameters
68 |
transformer_predictor: nn.Module,
69 |
transformer_in_feature: str,
70 |
71 |
72 |
NOTE: this interface is experimental.
73 |
74 |
input_shape: shapes (channels and stride) of the input features
75 |
num_classes: number of classes to predict
76 |
pixel_decoder: the pixel decoder module
77 |
loss_weight: loss weight
78 |
ignore_value: category id to be ignored during training.
79 |
transformer_predictor: the transformer decoder that makes prediction
80 |
transformer_in_feature: input feature name to the transformer_predictor
81 |
82 |
83 |
input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
84 |
self.in_features = [k for k, v in input_shape]
85 |
feature_strides = [v.stride for k, v in input_shape]
86 |
feature_channels = [v.channels for k, v in input_shape]
87 |
88 |
self.ignore_value = ignore_value
89 |
self.common_stride = 4
90 |
self.loss_weight = loss_weight
91 |
92 |
self.pixel_decoder = pixel_decoder
93 |
self.predictor = transformer_predictor
94 |
self.transformer_in_feature = transformer_in_feature
95 |
96 |
self.num_classes = num_classes
97 |
98 |
99 |
# Build the constructor keyword arguments from a config node and the backbone
# output shapes.
# `cls` as first parameter implies a classmethod; the decorator line is not
# visible in this copy -- TODO confirm.
# NOTE(review): this copy of the block is incomplete. The tail of the
# "input_shape" comprehension, the leading arguments of the
# OpenVocabTransformerPredictor(...) call and all closing brackets are missing,
# and the bare "NNN |" lines look like line-number residue from a diff view.
# Restore from the upstream file before relying on it.
def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
100 |
return {
101 |
"input_shape": {
102 |
k: v
103 |
for k, v in input_shape.items()
104 |
105 |
106 |
"ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
107 |
"num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
108 |
"pixel_decoder": build_pixel_decoder(cfg, input_shape),
109 |
"loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
110 |
"transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE,
111 |
"transformer_predictor": OpenVocabTransformerPredictor(
112 |
113 |
# The two lines below are the branches of a conditional expression whose first
# operand is missing here; the else-branch reads the channel count of the
# backbone feature named by TRANSFORMER_IN_FEATURE.
114 |
if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder"
115 |
else input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels,
116 |
117 |
118 |
119 |
120 |
def forward(self, features):
    """Run the head on ``features``; thin wrapper that delegates to ``self.layers``."""
    return self.layers(features)
122 |
123 |
def layers(self, features):
    """
    Decode ``features`` with the pixel decoder and feed the predictor.

    Args:
        features: mapping from feature name to feature map; it is passed to
            ``self.pixel_decoder.forward_features`` and, unless the
            transformer input is "transformer_encoder", indexed by
            ``self.transformer_in_feature``.

    Returns:
        Whatever ``self.predictor`` returns for the selected input feature
        and the mask features.

    Raises:
        AssertionError: if "transformer_encoder" is requested but the pixel
            decoder returned no encoder features.
    """
    (
        mask_features,
        transformer_encoder_features,
    ) = self.pixel_decoder.forward_features(features)

    if self.transformer_in_feature == "transformer_encoder":
        # Only a pixel decoder with a transformer encoder provides these.
        assert (
            transformer_encoder_features is not None
        ), "Please use the TransformerEncoderPixelDecoder."
        predictions = self.predictor(transformer_encoder_features, mask_features)
    else:
        predictions = self.predictor(
            features[self.transformer_in_feature], mask_features
        )
    return predictions
138 |
139 |
def freeze_pretrained(self):
    """Disable gradients for every direct child module except ``predictor``."""
    for name, module in self.named_children():
        if name == "predictor":
            # The predictor stays trainable.
            continue
        for param in module.parameters():
            param.requires_grad = False
144 |
145 |
@@ -0,0 +1,308 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 |
import logging
5 |
from typing import Callable, Dict, List, Optional, Tuple, Union
6 |
7 |
import fvcore.nn.weight_init as weight_init
8 |
from torch import nn
9 |
from torch.nn import functional as F
10 |
11 |
from detectron2.config import configurable
12 |
from detectron2.layers import Conv2d, ShapeSpec, get_norm
13 |
from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
14 |
15 |
from ..transformer.position_encoding import PositionEmbeddingSine
16 |
from ..transformer.transformer import TransformerEncoder, TransformerEncoderLayer
17 |
18 |
19 |
def build_pixel_decoder(cfg, input_shape):
    """
    Build a pixel decoder from `cfg.MODEL.MASK_FORMER.PIXEL_DECODER_NAME`.

    Args:
        cfg: config node; the decoder name is read from it and it is passed
            on to the registered constructor.
        input_shape: passed unchanged to the registered constructor.

    Returns:
        The constructed module.

    Raises:
        ValueError: if the constructed module has no callable
            ``forward_features`` attribute.
    """
    name = cfg.MODEL.MASK_FORMER.PIXEL_DECODER_NAME
    model = SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape)
    forward_features = getattr(model, "forward_features", None)
    if not callable(forward_features):
        raise ValueError(
            "Only SEM_SEG_HEADS with forward_features method can be used as pixel decoder. "
            f"Please implement forward_features for {name} to only return mask features."
        )
    return model
32 |
33 |
34 |
35 |
class BasePixelDecoder(nn.Module):
36 |
37 |
# Constructor of the FPN-style pixel decoder: for every input feature level it
# creates an output conv (registered as "layer_<idx+1>") and, for all but the
# last level, a lateral conv (registered as "adapter_<idx+1>"), then stores the
# conv lists reversed and creates the final `mask_features` conv.
# NOTE(review): this copy of the block is incomplete. The `self` parameter,
# docstring delimiters, every Conv2d(...) argument list, the `else:` branch
# header and the statements that fill `lateral_convs` / `output_convs` are
# missing, and the bare "NN |" lines look like line-number residue from a diff
# view. Restore from the upstream file before relying on it.
def __init__(
38 |
39 |
input_shape: Dict[str, ShapeSpec],
40 |
41 |
conv_dim: int,
42 |
mask_dim: int,
43 |
norm: Optional[Union[str, Callable]] = None,
44 |
45 |
46 |
NOTE: this interface is experimental.
47 |
48 |
input_shape: shapes (channels and stride) of the input features
49 |
conv_dims: number of output channels for the intermediate conv layers.
50 |
mask_dim: number of output channels for the final conv layer.
51 |
norm (str or callable): normalization for all conv layers
52 |
53 |
54 |
55 |
# Order the input features by increasing stride.
input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
56 |
self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5"
57 |
feature_channels = [v.channels for k, v in input_shape]
58 |
59 |
lateral_convs = []
60 |
output_convs = []
61 |
62 |
# True only when `norm` is the empty string.
use_bias = norm == ""
63 |
for idx, in_channels in enumerate(feature_channels):
64 |
# Last (largest-stride) level: only an output conv is created here.
if idx == len(self.in_features) - 1:
65 |
output_norm = get_norm(norm, conv_dim)
66 |
output_conv = Conv2d(
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
self.add_module("layer_{}".format(idx + 1), output_conv)
78 |
79 |
80 |
81 |
82 |
# Remaining levels: a lateral conv plus an output conv.
lateral_norm = get_norm(norm, conv_dim)
83 |
output_norm = get_norm(norm, conv_dim)
84 |
85 |
lateral_conv = Conv2d(
86 |
87 |
88 |
89 |
90 |
91 |
92 |
output_conv = Conv2d(
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
self.add_module("adapter_{}".format(idx + 1), lateral_conv)
105 |
self.add_module("layer_{}".format(idx + 1), output_conv)
106 |
107 |
108 |
109 |
# Place convs into top-down order (from low to high resolution)
110 |
# to make the top-down computation in forward clearer.
111 |
self.lateral_convs = lateral_convs[::-1]
112 |
self.output_convs = output_convs[::-1]
113 |
114 |
self.mask_dim = mask_dim
115 |
self.mask_features = Conv2d(
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
# Collect the constructor keyword arguments ("input_shape", "conv_dim",
# "mask_dim", "norm") from the config node.
# `cls` as first parameter implies a classmethod; the decorator line is not
# visible in this copy -- TODO confirm.
# NOTE(review): the tail of the "input_shape" comprehension (anything after the
# `for` clause, and the closing brace) is missing here, and the bare "NNN |"
# lines look like line-number residue from a diff view.
def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
126 |
ret = {}
127 |
ret["input_shape"] = {
128 |
k: v
129 |
for k, v in input_shape.items()
130 |
131 |
132 |
ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
133 |
ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
134 |
ret["norm"] = cfg.MODEL.SEM_SEG_HEAD.NORM
135 |
return ret
136 |
137 |
def forward_features(self, features):
    """
    Top-down pass over the feature maps.

    Args:
        features: mapping from feature name to feature map, indexed with the
            names in ``self.in_features``.

    Returns:
        A 2-tuple ``(mask_features, None)``; the second slot is always None
        in this decoder.
    """
    # Reverse feature maps into top-down order (from low to high resolution);
    # the conv lists are already stored in that order.
    for name, lateral_conv, output_conv in zip(
        reversed(self.in_features), self.lateral_convs, self.output_convs
    ):
        x = features[name]
        if lateral_conv is None:
            # Coarsest level: nothing to merge with yet.
            y = output_conv(x)
        else:
            cur_fpn = lateral_conv(x)
            # Following FPN implementation, we use nearest upsampling here
            y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest")
            y = output_conv(y)
    return self.mask_features(y), None
151 |
152 |
def forward(self, features, targets=None):
    """
    Log a notice and delegate to ``forward_features``.

    Args:
        features: passed unchanged to ``self.forward_features``.
        targets: accepted for interface compatibility; not used.

    Returns:
        The result of ``self.forward_features(features)``.
    """
    logger = logging.getLogger(__name__)
    # NOTE(review): the name of the logging call was lost in this copy of the
    # file; warning level is assumed from the wording of the message.
    logger.warning(
        "Calling forward() may cause unpredicted behavior of PixelDecoder module."
    )
    return self.forward_features(features)
158 |
159 |
160 |
class TransformerEncoderOnly(nn.Module):
161 |
# Constructor: builds one TransformerEncoderLayer, an optional LayerNorm (only
# when `normalize_before` is truthy) and the TransformerEncoder stack, then
# records `d_model` and `nhead` on the instance.
# NOTE(review): the whole parameter list, the closing brackets and the
# statements between the encoder construction and `self.d_model` are missing
# in this copy; the bare "NNN |" lines look like line-number residue from a
# diff view. Parameter names and defaults cannot be determined from here.
def __init__(
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
encoder_layer = TransformerEncoderLayer(
174 |
d_model, nhead, dim_feedforward, dropout, activation, normalize_before
175 |
176 |
# A final norm is only created for the pre-norm configuration.
encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
177 |
self.encoder = TransformerEncoder(
178 |
encoder_layer, num_encoder_layers, encoder_norm
179 |
180 |
181 |
182 |
183 |
self.d_model = d_model
184 |
self.nhead = nhead
185 |
186 |
# Visit every parameter that has more than one dimension.
# NOTE(review): the statement executed for those parameters is missing in this
# copy (presumably a weight initialiser -- confirm against the upstream file).
def _reset_parameters(self):
187 |
for p in self.parameters():
188 |
if p.dim() > 1:
189 |
190 |
191 |
def forward(self, src, mask, pos_embed):
    """
    Run the encoder over a 4-D feature map.

    Args:
        src: tensor unpacked as (N, C, H, W).
        mask: optional tensor, flattened from dim 1 onward and passed as
            ``src_key_padding_mask``; may be None.
        pos_embed: positional embedding, flattened and permuted the same way
            as ``src``.

    Returns:
        The encoder output viewed back as (N, C, H, W).
    """
    # flatten NxCxHxW to HWxNxC
    bs, c, h, w = src.shape
    src = src.flatten(2).permute(2, 0, 1)
    pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
    if mask is not None:
        mask = mask.flatten(1)

    memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)
    # HWxNxC back to NxCxHxW
    return memory.permute(1, 2, 0).view(bs, c, h, w)
201 |
202 |
203 |
204 |
class TransformerEncoderPixelDecoder(BasePixelDecoder):
205 |
206 |
# Constructor: initialises the base pixel decoder, then adds a 1x1 input
# projection, a transformer encoder and a sine position embedding for the
# last (largest-stride) feature, and replaces that level's output conv
# ("layer_<len(in_features)>" and `self.output_convs[0]`) with a new one.
# NOTE(review): this copy of the block is incomplete. The `self` parameter,
# docstring delimiters, the arguments of TransformerEncoderOnly(...) and of the
# final Conv2d(...), and all closing brackets are missing; the bare "NNN |"
# lines look like line-number residue from a diff view.
def __init__(
207 |
208 |
input_shape: Dict[str, ShapeSpec],
209 |
210 |
transformer_dropout: float,
211 |
transformer_nheads: int,
212 |
transformer_dim_feedforward: int,
213 |
transformer_enc_layers: int,
214 |
transformer_pre_norm: bool,
215 |
conv_dim: int,
216 |
mask_dim: int,
217 |
norm: Optional[Union[str, Callable]] = None,
218 |
219 |
220 |
NOTE: this interface is experimental.
221 |
222 |
input_shape: shapes (channels and stride) of the input features
223 |
transformer_dropout: dropout probability in transformer
224 |
transformer_nheads: number of heads in transformer
225 |
transformer_dim_feedforward: dimension of feedforward network
226 |
transformer_enc_layers: number of transformer encoder layers
227 |
transformer_pre_norm: whether to use pre-layernorm or not
228 |
conv_dims: number of output channels for the intermediate conv layers.
229 |
mask_dim: number of output channels for the final conv layer.
230 |
norm (str or callable): normalization for all conv layers
231 |
232 |
super().__init__(input_shape, conv_dim=conv_dim, mask_dim=mask_dim, norm=norm)
233 |
234 |
# Order the input features by increasing stride.
input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
235 |
self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5"
236 |
feature_strides = [v.stride for k, v in input_shape]
237 |
feature_channels = [v.channels for k, v in input_shape]
238 |
239 |
# Channel count of the last feature in stride order.
in_channels = feature_channels[len(self.in_features) - 1]
240 |
self.input_proj = Conv2d(in_channels, conv_dim, kernel_size=1)
241 |
242 |
self.transformer = TransformerEncoderOnly(
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
N_steps = conv_dim // 2
251 |
self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)
252 |
253 |
# update layer
254 |
use_bias = norm == ""
255 |
output_norm = get_norm(norm, conv_dim)
256 |
output_conv = Conv2d(
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
# Swap the conv registered by the base class for the new one.
delattr(self, "layer_{}".format(len(self.in_features)))
268 |
self.add_module("layer_{}".format(len(self.in_features)), output_conv)
269 |
self.output_convs[0] = output_conv
270 |
271 |
272 |
# Extend the base-class constructor arguments with the transformer settings
# read from cfg.MODEL.MASK_FORMER.
# `cls` as first parameter implies a classmethod; the decorator line is not
# visible in this copy -- TODO confirm.
# NOTE(review): the lines numbered 277-279 are empty in this copy, and the
# constructor's `transformer_enc_layers` argument is not set by any visible
# line -- presumably it was assigned there; confirm against the upstream file.
# The bare "NNN |" lines look like line-number residue from a diff view.
def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
273 |
ret = super().from_config(cfg, input_shape)
274 |
ret["transformer_dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT
275 |
ret["transformer_nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
276 |
ret["transformer_dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
277 |
278 |
279 |
280 |
ret["transformer_pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
281 |
return ret
282 |
283 |
def forward_features(self, features):
    """
    Top-down pass with a transformer encoder on the coarsest feature map.

    Args:
        features: mapping from feature name to feature map, indexed with the
            names in ``self.in_features``.

    Returns:
        A 2-tuple ``(mask_features, transformer_encoder_features)`` where the
        second item is the transformer output for the coarsest level.
    """
    # Reverse feature maps into top-down order (from low to high resolution);
    # the conv lists are already stored in that order.
    for name, lateral_conv, output_conv in zip(
        reversed(self.in_features), self.lateral_convs, self.output_convs
    ):
        x = features[name]
        if lateral_conv is None:
            # Coarsest level: project, add positions, run the encoder.
            transformer = self.input_proj(x)
            pos = self.pe_layer(x)
            transformer = self.transformer(transformer, None, pos)
            y = output_conv(transformer)
            # save intermediate feature as input to Transformer decoder
            transformer_encoder_features = transformer
        else:
            cur_fpn = lateral_conv(x)
            # Following FPN implementation, we use nearest upsampling here
            y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest")
            y = output_conv(y)
    return self.mask_features(y), transformer_encoder_features
302 |
303 |
def forward(self, features, targets=None):
    """
    Log a notice and delegate to ``forward_features``.

    Args:
        features: passed unchanged to ``self.forward_features``.
        targets: accepted for interface compatibility; not used.

    Returns:
        The result of ``self.forward_features(features)``.
    """
    logger = logging.getLogger(__name__)
    # NOTE(review): the name of the logging call was lost in this copy of the
    # file; warning level is assumed from the wording of the message.
    logger.warning(
        "Calling forward() may cause unpredicted behavior of PixelDecoder module."
    )
    return self.forward_features(features)
@@ -0,0 +1,187 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Modified by Bowen Cheng from
3 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
4 |
5 |
6 |
Modules to compute the matching cost and solve the corresponding LSAP.
7 |
8 |
import torch
9 |
import torch.nn.functional as F
10 |
from scipy.optimize import linear_sum_assignment
11 |
from torch import nn
12 |
13 |
14 |
def batch_dice_loss(inputs, targets):
    """
    Compute the pairwise DICE loss, similar to generalized IOU for masks.

    Args:
        inputs: A float tensor of logits with N entries along dim 0; a
            sigmoid is applied and every later dim is flattened to [N, C].
        targets: A float tensor of shape [M, C]. Stores the binary
            classification label for each element
            (0 for the negative class and 1 for the positive class).

    Returns:
        Tensor of shape [N, M]: the loss of every input against every target.
    """
    probs = inputs.sigmoid().flatten(1)
    numerator = 2 * torch.einsum("nc,mc->nm", probs, targets)
    denominator = probs.sum(-1)[:, None] + targets.sum(-1)[None, :]
    # +1 in both terms smooths the ratio and avoids division by zero.
    return 1 - (numerator + 1) / (denominator + 1)
30 |
31 |
32 |
def batch_sigmoid_focal_loss(inputs, targets, alpha: float = 0.25, gamma: float = 2):
    """
    Pairwise sigmoid focal loss (the loss used in RetinaNet for dense detection).

    Args:
        inputs: A float tensor of logits with shape [N, C].
        targets: A float tensor of shape [M, C]. Stores the binary
            classification label for each element
            (0 for the negative class and 1 for the positive class).
        alpha: (optional) Weighting factor in range (0,1) to balance
            positive vs negative examples. Default = 0.25; pass a negative
            value to disable the weighting.
        gamma: Exponent of the modulating factor (1 - p_t) to
            balance easy vs hard examples.

    Returns:
        Loss tensor of shape [N, M], averaged over the C elements.
    """
    hw = inputs.shape[1]

    prob = inputs.sigmoid()
    # Per-element loss assuming the label is 1 / assuming the label is 0.
    focal_pos = ((1 - prob) ** gamma) * F.binary_cross_entropy_with_logits(
        inputs, torch.ones_like(inputs), reduction="none"
    )
    focal_neg = (prob ** gamma) * F.binary_cross_entropy_with_logits(
        inputs, torch.zeros_like(inputs), reduction="none"
    )
    if alpha >= 0:
        focal_pos = focal_pos * alpha
        focal_neg = focal_neg * (1 - alpha)

    # Pick the positive or negative term per element according to each target.
    loss = torch.einsum("nc,mc->nm", focal_pos, targets) + torch.einsum(
        "nc,mc->nm", focal_neg, (1 - targets)
    )

    return loss / hw
66 |
67 |
68 |
class HungarianMatcher(nn.Module):
    """This class computes an assignment between the targets and the predictions of the network

    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
    while the others are un-matched (and thus treated as non-objects).
    """

    def __init__(
        self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1
    ):
        """Creates the matcher

        Args:
            cost_class: This is the relative weight of the classification error in the matching cost
            cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost
            cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost

        Raises:
            AssertionError: if all three weights are 0.
        """
        super().__init__()
        self.cost_class = cost_class
        self.cost_mask = cost_mask
        self.cost_dice = cost_dice
        assert (
            cost_class != 0 or cost_mask != 0 or cost_dice != 0
        ), "all costs cant be 0"

    # Matching is not differentiable, and SciPy cannot convert a tensor that
    # requires grad, so the whole computation runs without autograd.
    @torch.no_grad()
    def memory_efficient_forward(self, outputs, targets):
        """More memory-friendly matching: one cost matrix per batch element."""
        bs, num_queries = outputs["pred_logits"].shape[:2]

        indices = []

        # Iterate through batch size
        for b in range(bs):
            out_prob = outputs["pred_logits"][b].softmax(
                -1
            )  # [num_queries, num_classes]
            out_mask = outputs["pred_masks"][b]  # [num_queries, H_pred, W_pred]

            tgt_ids = targets[b]["labels"]
            # gt masks are already padded when preparing target
            tgt_mask = targets[b]["masks"].to(out_mask)

            # Compute the classification cost. Contrary to the loss, we don't use the NLL,
            # but approximate it in 1 - proba[target class].
            # The 1 is a constant that doesn't change the matching, it can be omitted.
            cost_class = -out_prob[:, tgt_ids]

            # Downsample gt masks to save memory
            tgt_mask = F.interpolate(
                tgt_mask[:, None], size=out_mask.shape[-2:], mode="nearest"
            )

            # Flatten spatial dimension
            out_mask = out_mask.flatten(1)  # [num_queries, H*W]
            tgt_mask = tgt_mask[:, 0].flatten(1)  # [num_targets, H*W]

            # Compute the focal loss between masks
            cost_mask = batch_sigmoid_focal_loss(out_mask, tgt_mask)

            # Compute the dice loss between masks
            cost_dice = batch_dice_loss(out_mask, tgt_mask)

            # Final cost matrix
            C = (
                self.cost_mask * cost_mask
                + self.cost_class * cost_class
                + self.cost_dice * cost_dice
            )
            C = C.reshape(num_queries, -1).cpu()

            # Solve the linear sum assignment problem for this batch element.
            indices.append(linear_sum_assignment(C))

        return [
            (
                torch.as_tensor(i, dtype=torch.int64),
                torch.as_tensor(j, dtype=torch.int64),
            )
            for i, j in indices
        ]

    @torch.no_grad()
    def forward(self, outputs, targets):
        """Performs the matching

        Args:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        return self.memory_efficient_forward(outputs, targets)

    def __repr__(self):
        head = "Matcher " + self.__class__.__name__
        body = [
            "cost_class: {}".format(self.cost_class),
            "cost_mask: {}".format(self.cost_mask),
            "cost_dice: {}".format(self.cost_dice),
        ]
        _repr_indent = 4
        lines = [head] + [" " * _repr_indent + line for line in body]
        return "\n".join(lines)
@@ -0,0 +1,2 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
@@ -0,0 +1,84 @@
1 |
# Copyright (c) Facebook, Inc. and its affiliates.
2 |
# Modified by Bowen Cheng from:
3 |
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
4 |
5 |
from torch import nn
6 |
from detectron2.config import configurable
7 |
from .transformer_predictor import TransformerPredictor, MLP
8 |
9 |
10 |
class OpenVocabTransformerPredictor(TransformerPredictor):
11 |
12 |
# Constructor: stores `mask_classification` and, when it is truthy, creates
# `self.class_embed` as an MLP built from
# (hidden_dim, embed_hidden_dim, embedding_dim, embed_layers).
# NOTE(review): this copy of the block is incomplete. The leading parameters
# (including `self` and the `mask_classification` name used below), the
# docstring, whatever ran on the lines numbered 31-48 (presumably the
# base-class initialiser call -- confirm) and the closing brackets are
# missing; the bare "NN |" lines look like line-number residue from a diff view.
def __init__(
13 |
14 |
15 |
16 |
17 |
embedding_dim: int,
18 |
embed_hidden_dim: int,
19 |
embed_layers: int,
20 |
hidden_dim: int,
21 |
num_queries: int,
22 |
nheads: int,
23 |
dropout: float,
24 |
dim_feedforward: int,
25 |
enc_layers: int,
26 |
dec_layers: int,
27 |
pre_norm: bool,
28 |
deep_supervision: bool,
29 |
mask_dim: int,
30 |
enforce_input_project: bool,
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
self.mask_classification = mask_classification
49 |
# output FFNs
50 |
if self.mask_classification:
51 |
self.class_embed = MLP(
52 |
hidden_dim, embed_hidden_dim, embedding_dim, embed_layers
53 |
54 |
55 |
def freeze_pretrained(self):
    """Disable gradients for every direct child module except ``class_embed``."""
    for name, module in self.named_children():
        if name == "class_embed":
            # The class-embedding head stays trainable.
            continue
        for param in module.parameters():
            param.requires_grad = False
60 |
61 |
62 |
# Collect the constructor keyword arguments from the config node; `in_channels`
# and `mask_classification` are passed through unchanged.
# `cls` as first parameter implies a classmethod; the decorator line is not
# visible in this copy -- TODO confirm.
# NOTE(review): the constructor takes `num_queries`, but no visible line sets
# ret["num_queries"]; the line numbered 71 is empty in this copy and presumably
# held that assignment -- confirm against the upstream file. The bare "NN |"
# lines look like line-number residue from a diff view.
def from_config(cls, cfg, in_channels, mask_classification):
63 |
ret = {}
64 |
ret["in_channels"] = in_channels
65 |
ret["mask_classification"] = mask_classification
66 |
67 |
ret["embedding_dim"] = cfg.MODEL.SEM_SEG_HEAD.EMBEDDING_DIM
68 |
ret["embed_hidden_dim"] = cfg.MODEL.SEM_SEG_HEAD.EMBED_HIDDEN_DIM
69 |
ret["embed_layers"] = cfg.MODEL.SEM_SEG_HEAD.EMBED_LAYERS
70 |
ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM
71 |
72 |
# Transformer parameters:
73 |
ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
74 |
ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT
75 |
ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
76 |
ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS
77 |
ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS
78 |
ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
79 |
ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
80 |
ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ
81 |
82 |
ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
83 |
84 |
return ret