<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<!-- Primary Meta Tags -->
<meta name="title" content="ARGenSeg: Image Segmentation with Autoregressive Image Generation Model - Xiaolong Wang, Lixiang Ru, Ziyuan Huang, Kaixiang Ji, Dandan Zheng, Jingdong Chen, Jun Zhou">
<meta name="description" content="We propose ARGenSeg, a novel AutoRegressive Generation-based paradigm for image Segmentation, achieving multimodal understanding and pixel-level perception within a unified framework.">
<meta name="keywords" content="image segmentation, autoregressive generation, multimodal learning, computer vision, MLLM, VQ-VAE, pixel-level perception">
<meta name="author" content="Xiaolong Wang, Lixiang Ru, Ziyuan Huang, Kaixiang Ji, Dandan Zheng, Jingdong Chen, Jun Zhou">
<meta name="robots" content="index, follow">
<meta name="language" content="English">
<!-- Open Graph / Facebook -->
<meta property="og:type" content="article">
<meta property="og:site_name" content="Ant Group">
<meta property="og:title" content="ARGenSeg: Image Segmentation with Autoregressive Image Generation Model">
<meta property="og:description" content="We propose ARGenSeg, a novel AutoRegressive Generation-based paradigm for image Segmentation, achieving multimodal understanding and pixel-level perception within a unified framework.">
<meta property="og:url" content="https://arxiv.org/abs/2510.20803">
<meta property="og:image" content="https://YOUR_DOMAIN.com/static/images/social_preview.png">
<meta property="og:image:width" content="1200">
<meta property="og:image:height" content="630">
<meta property="og:image:alt" content="ARGenSeg - Image Segmentation with Autoregressive Image Generation Model">
<meta property="article:published_time" content="2025-10-28T00:00:00.000Z">
<meta property="article:author" content="Xiaolong Wang">
<meta property="article:section" content="Research">
<meta property="article:tag" content="image segmentation">
<meta property="article:tag" content="autoregressive generation">
<!-- Twitter -->
<meta name="twitter:card" content="summary_large_image">
<meta name="twitter:site" content="@AntGroup">
<meta name="twitter:creator" content="@XiaolongWang">
<meta name="twitter:title" content="ARGenSeg: Image Segmentation with Autoregressive Image Generation Model">
<meta name="twitter:description" content="We propose ARGenSeg, a novel AutoRegressive Generation-based paradigm for image Segmentation, achieving multimodal understanding and pixel-level perception within a unified framework.">
<meta name="twitter:image" content="https://YOUR_DOMAIN.com/static/images/social_preview.png">
<meta name="twitter:image:alt" content="ARGenSeg - Image Segmentation with Autoregressive Image Generation Model">
<!-- Additional SEO -->
<meta name="theme-color" content="#2563eb">
<meta name="msapplication-TileColor" content="#2563eb">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="default">
<!-- Preconnect for performance -->
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="preconnect" href="https://ajax.googleapis.com">
<link rel="preconnect" href="https://documentcloud.adobe.com">
<link rel="preconnect" href="https://cdn.jsdelivr.net">
<title>ARGenSeg: Image Segmentation with Autoregressive Image Generation Model</title>
<!-- Favicon and App Icons -->
<link rel="icon" type="image/x-icon" href="static/images/favicon.ico">
<link rel="apple-touch-icon" href="static/images/favicon.ico">
<!-- Critical CSS - Load synchronously -->
<link rel="stylesheet" href="static/css/bulma.min.css">
<link rel="stylesheet" href="static/css/index.css">
<!-- Non-critical CSS - Load asynchronously -->
<link rel="preload" href="static/css/bulma-carousel.min.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
<link rel="preload" href="static/css/bulma-slider.min.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
<link rel="preload" href="static/css/fontawesome.all.min.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
<link rel="preload" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
<!-- Fallback for browsers that don't support preload -->
<noscript>
<link rel="stylesheet" href="static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="static/css/bulma-slider.min.css">
<link rel="stylesheet" href="static/css/fontawesome.all.min.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
</noscript>
<!-- Fonts - Optimized loading -->
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap" rel="stylesheet">
<!-- Defer non-critical JavaScript -->
<script defer src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script defer src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
<script defer src="static/js/fontawesome.all.min.js"></script>
<script defer src="static/js/bulma-carousel.min.js"></script>
<script defer src="static/js/bulma-slider.min.js"></script>
<script defer src="static/js/index.js"></script>
</head>
<body>
<main id="main-content">
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<h1 class="title is-1 publication-title">ARGenSeg: Image Segmentation with Autoregressive Image Generation Model</h1>
<div class="is-size-5 publication-authors">
<span class="author-block">
<a href="https://xlwang.site" target="_blank">Xiaolong Wang</a>,</span>
<span class="author-block">
<a href="https://rulixiang.github.io/" target="_blank">Lixiang Ru</a>,</span>
<span class="author-block">
<a href="https://huang-ziyuan.github.io/" target="_blank">Ziyuan Huang</a>,</span>
<span class="author-block">
<a href="https://scholar.google.com/citations?user=PNTIf4gAAAAJ" target="_blank">Kaixiang Ji</a>,</span><br>
<span class="author-block">
<a href="https://openreview.net/profile?id=~DanDan_Zheng1" target="_blank">Dandan Zheng</a>,</span>
<span class="author-block">
<a href="https://scholar.google.com/citations?user=8SCEv-YAAAAJ&hl=en" target="_blank">Jingdong Chen</a>,</span>
<span class="author-block">
<a href="https://scholar.google.com/citations?user=mCVvloEAAAAJ&hl=en" target="_blank">Jun Zhou</a>
</span>
</div>
<div class="is-size-5 publication-authors">
<span class="author-block">Ant Group<br><b>NeurIPS 2025</b></span>
</div>
<div class="column has-text-centered">
<div class="publication-links">
<span class="link-block">
<a href="https://neurips.cc/virtual/2025/loc/san-diego/poster/115738" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>Paper</span>
</a>
</span>
<span class="link-block">
<a href="https://github.com/xlwangDev/ARGenSeg" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code Coming Soon...</span>
</a>
</span>
<span class="link-block">
<a href="https://arxiv.org/abs/2510.20803" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>arXiv</span>
</a>
</span>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<!-- Overview of ARGenSeg -->
<section class="section hero">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column is-full">
<div class="content">
<!-- <h2 class="title is-3 has-text-centered">Overview</h2> -->
<center>
<img src="https://cdn.bytez.com/mobilePapers/v2/neurips/115738/images/0-0.png" alt="Overview of ARGenSeg" class="center-image blend-img-background" width="100%"/>
</center>
<!-- <div class="level-set has-text-justified"> -->
<div class="subtitle" style="font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; font-size: 1.1rem; line-height: 1.6; color: #2d3748; font-weight: 400; max-width: 1000px; margin: 0 auto; text-align: left;">
<p>
ARGenSeg is a unified framework for visual understanding, segmentation, and generation.
It supports semantic, instance, interactive, and zero-shot reasoning segmentation, as well as anomaly detection, by leveraging strong visual understanding capabilities.
</p>
</div>
</div>
</div>
</div>
</div>
</section>
<!-- Paper abstract -->
<section class="section hero is-light">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>
We propose a novel AutoRegressive Generation-based paradigm for image Segmentation (ARGenSeg), achieving multimodal understanding and pixel-level perception within a unified framework. Prior works integrating image segmentation into multimodal large language models (MLLMs) typically employ either boundary points representation or dedicated segmentation heads. These methods rely on discrete representations or semantic prompts fed into task-specific decoders, which limits the ability of the MLLM to capture fine-grained visual details. To address these challenges, we introduce a segmentation framework for MLLM based on image generation, which naturally produces dense masks for target objects. We leverage MLLM to output visual tokens and detokenize them into images using a universal VQ-VAE, making the segmentation fully dependent on the pixel-level understanding of the MLLM. To reduce inference latency, we employ a next-scale-prediction strategy to generate required visual tokens in parallel. Extensive experiments demonstrate that our method surpasses prior state-of-the-art approaches on multiple segmentation datasets with a remarkable boost in inference speed, while maintaining strong understanding capabilities.
</p>
</div>
</div>
</div>
</div>
</section>
<!-- End paper abstract -->
<section class="section hero is-small">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column is-full">
<div class="content">
<h2 class="title is-3">Model Architecture and Workflow</h2>
<!-- <h2 class="title is-4">Model Architecture and Its Training and Inference Procedures</h2> -->
<img src="https://cdn.bytez.com/mobilePapers/v2/neurips/115738/images/3-16.png" alt="Architecture of ARGenSeg and its training and inference procedures" width="95%" style="display: block; margin: 0 auto;"/>
<div class="level-set has-text-justified">
<p>
<strong>Left:</strong> ARGenSeg integrates image segmentation into the MLLM via an autoregressive image generation paradigm. A unified classification prediction head is used to generate both text and visual tokens.<br>
<strong>Right:</strong> Visual tokens are generated in parallel using the next-scale prediction strategy. During training, a VAE encoder is used to construct supervision for cross-entropy loss. During inference, the VAE decoder reconstructs the image from the predicted visual tokens.
[S]/[E] denotes &lt;gen_start&gt;/&lt;gen_end&gt;.
</p>
</div>
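The parallel next-scale decoding described above can be illustrated with a minimal sketch. This is not the released implementation: the scale sizes are illustrative (the paper uses K=10 scales), and the MLLM's prediction head is stubbed with random logits. The point is the control flow: each step emits all tokens of one scale at once, so the number of sequential forward passes equals the number of scales rather than the number of tokens.

```python
import numpy as np

rng = np.random.default_rng(0)
SCALES = [1, 2, 4, 8, 16]   # token-map side lengths r_1..r_K (illustrative)
CODEBOOK = 32               # toy VQ codebook size (illustrative)

def upsample(x, size):
    """Nearest-neighbour upsample an (h, w) map to (size, size)."""
    reps = size // x.shape[0]
    return np.repeat(np.repeat(x, reps, axis=0), reps, axis=1)

def generate_mask_features(final_size=16):
    """Accumulate coarse-to-fine token maps into one feature map.

    Each loop iteration stands for ONE forward pass that predicts all
    r_k * r_k tokens of the current scale in parallel, unlike next-token
    decoding, which would need one pass per token.
    """
    feat = np.zeros((final_size, final_size))
    for r in SCALES:
        logits = rng.normal(size=(r, r, CODEBOOK))  # stub for the MLLM head
        tokens = logits.argmax(axis=-1)             # greedy token ids
        feat += upsample(tokens.astype(float), final_size)
    return feat

mask_feat = generate_mask_features()
print(mask_feat.shape)                     # (16, 16)
print(len(SCALES), "sequential steps vs", 16 * 16, "for next-token decoding")
```

In the real model, the final token maps would be passed to the VQ-VAE decoder to reconstruct a dense mask image; here the accumulation simply stands in for that detokenization step.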
<!-- <h3 class="title is-4">Core Mechanism: Direct Token Prediction</h3>
<p>
We use a pre-trained VQ-VAE as the image tokenizer, which quantizes images into discrete visual token IDs.
These visual token IDs are added to the LLM's vocabulary. To perform segmentation, the model is trained to generate these quantized visual tokens directly, aligning with the next-token autoregressive prediction mechanism of language models.
The image tokenizer and vision encoder are kept <strong>frozen</strong> during training to ensure the model learns pixel-level information without relying on subsequent dedicated decoders.
</p>
<h3 class="title is-4">Efficiency through Next-Scale Prediction</h3>
<p>
To ensure fast response times required by segmentation tasks, ARGenSeg adopts the <strong>next-scale prediction strategy</strong>.
The feature map is quantized into K multi-scale token maps (r<sub>1</sub>, r<sub>2</sub>, ..., r<sub>K</sub>).
At each inference step, the model generates all tokens required for the current scale in <strong>parallel</strong>.
This multi-scale process, which typically uses K=10 scales, naturally aligns with the intuitive process of object segmentation: <strong>coarse localization followed by fine-grained boundary refinement</strong>.
</p> -->
</div>
</div>
</div>
</div>
</section>
<!-- Methodology -->
<section class="section hero">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column is-full">
<h2 class="title is-3">Experiments</h2>
<div class="content has-text-left">
<h3 class="title is-4">Quantitative Comparisons</h3>
<p>
ARGenSeg consistently outperforms prior state-of-the-art methods on standard Referring Expression Segmentation (RES) benchmarks (RefCOCO/+/g).
Notably, ARGenSeg achieves superior performance while using significantly less segmentation data (402K samples) compared to HiMTok (2.91M samples).
</p>
<img src="https://cdn.bytez.com/mobilePapers/v2/neurips/115738/images/5-0.png" alt="Quantitative comparison with state-of-the-art methods on RES benchmarks" width="90%" style="display: block; margin: 0 auto;"/>
<p>
<br>
In a direct comparison with a single-scale visual tokenizer (the pre-trained VQ-GAN from Janus), our multi-scale approach shows a clear advantage.
Reporting gIoU on the validation split, our method is both significantly faster and more robust, a benefit of its coarse-to-fine refinement strategy.
</p>
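For reference, the gIoU metric reported on these benchmarks is the per-image intersection-over-union averaged over the dataset. A toy sketch, with illustrative masks and the usual convention that two empty masks score 1.0:

```python
import numpy as np

def iou(pred, gt):
    """IoU of two binary masks; defined as 1.0 when both are empty."""
    union = np.logical_or(pred, gt).sum()
    if union == 0:
        return 1.0
    return np.logical_and(pred, gt).sum() / union

def g_iou(preds, gts):
    """gIoU: average the per-image IoU over the dataset."""
    return float(np.mean([iou(p, g) for p, g in zip(preds, gts)]))

# Illustrative 2x2 masks: the first prediction is perfect (IoU 1.0),
# the second covers one of two ground-truth pixels... wait, the other
# way round: pred `a` has 2 pixels, gt `b` has 1, so IoU = 1/2.
a = np.array([[1, 1], [0, 0]], dtype=bool)
b = np.array([[1, 0], [0, 0]], dtype=bool)
print(g_iou([a, a], [a, b]))   # (1.0 + 0.5) / 2 = 0.75
```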
<img src="https://cdn.bytez.com/mobilePapers/v2/neurips/115738/images/8-2.png" alt="Comparison between single-scale and multi-scale visual tokenizers" width="90%" style="display: block; margin: 0 auto;"/>
<h3 class="title is-4">Qualitative Results</h3>
<img src="https://cdn.bytez.com/mobilePapers/v2/neurips/115738/images/6-1.png" alt="Qualitative segmentation results showing coarse-to-fine refinement" width="95%" style="display: block; margin: 0 auto;"/>
<p>
<br>
ARGenSeg first localizes the target object and then progressively refines its boundaries.
</p>
<img src="https://cdn.bytez.com/mobilePapers/v2/neurips/115738/images/7-1.png" alt="Interactive segmentation and instruction-based image generation results" width="95%" style="display: block; margin: 0 auto;"/>
<p>
<br>
<strong>Top:</strong> Visualization of interactive segmentation. Points and scribbles are provided as visual prompts, while bounding boxes are input via text.<br>
<strong>Bottom:</strong> Visualization results of instruction-based image generation. The model is trained on image generation data for only 50k iterations. </p>
</div>
</div>
</div>
</div>
</section>
<!-- End Methodology -->
<section class="section hero is-small">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column is-full">
<div class="content">
<h2 class="title is-3">Poster</h2>
<!-- <h2 class="title is-4">Model Architecture and Its Training and Inference Procedures</h2> -->
<img src="https://neurips.cc/media/PosterPDFs/NeurIPS%202025/115738.png?t=1762512093.3242867" alt="ARGenSeg NeurIPS 2025 poster" width="100%" style="display: block; margin: 0 auto;"/>
</div>
</div>
</div>
</div>
</section>
<!--BibTex citation -->
<section class="section" id="BibTeX">
<div class="container is-max-desktop content">
<div class="bibtex-header">
<h2 class="title">BibTeX</h2>
<button class="copy-bibtex-btn" onclick="copyBibTeX()" title="Copy BibTeX to clipboard">
<i class="fas fa-copy"></i>
<span class="copy-text">Copy</span>
</button>
</div>
<pre id="bibtex-code"><code>
@article{wang2025argenseg,
title={ARGenSeg: Image Segmentation with Autoregressive Image Generation Model},
author={Wang, Xiaolong and Ru, Lixiang and Huang, Ziyuan and Ji, Kaixiang and Zheng, Dandan and Chen, Jingdong and Zhou, Jun},
journal={arXiv preprint arXiv:2510.20803},
year={2025}
}
</code></pre>
</div>
</section>
<!--End BibTex citation -->
<footer class="footer">
<div class="container">
<div class="columns is-centered">
<div class="column is-8">
<div class="content">
<p>
This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank">Academic Project Page Template</a> which was adopted from the <a href="https://nerfies.github.io" target="_blank">Nerfies</a> project page.
<br> This website is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
Commons Attribution-ShareAlike 4.0 International License</a>.
</p>
</div>
</div>
</div>
</div>
</footer>
<!-- Statcounter tracking code -->
<!-- You can add a tracker to track page visits by creating an account at statcounter.com -->
<!-- End of Statcounter Code -->
</body>
</html>