"Generalized Decoding for Pixel, Image, and Language" [1], abbreviated as X-Decoder, is a generalized decoding system that is applicable to various vision-language tasks with a unified architecture. Its input and output modalities span vision and language, while its granularity spans from the pixel level to the image level.
"Segment Everything Everywhere All at Once" [2], denoted as SEEM, extends X-Decoder with human interaction capability: a user can refer to a segment with a scribble, box, point, etc. It also accepts prompts that span vision and language in a composite manner.
@comment{X-Decoder, CVPR 2023. The asterisk markers attached to some authors in the original listing were removed so that BibTeX can parse the names.}
@inproceedings{zou2023xdecoder,
  author    = {Zou, Xueyan and Dou, Zi-Yi and Yang, Jianwei and Gan, Zhe and Li, Linjie and Li, Chunyuan and Dai, Xiyang and Behl, Harkirat and Wang, Jianfeng and Yuan, Lu and Peng, Nanyun and Wang, Lijuan and Lee, Yong Jae and Gao, Jianfeng},
  title     = {Generalized Decoding for Pixel, Image, and Language},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2023},
}
@comment{SEEM, NeurIPS 2023. The asterisk markers attached to some authors in the original listing were removed so that BibTeX can parse the names.}
@inproceedings{zou2023seem,
  author    = {Zou, Xueyan and Yang, Jianwei and Zhang, Hao and Li, Feng and Li, Linjie and Wang, Jianfeng and Wang, Lijuan and Gao, Jianfeng and Lee, Yong Jae},
  title     = {Segment Everything Everywhere All at Once},
  booktitle = {Advances in Neural Information Processing Systems ({NeurIPS})},
  year      = {2023},
}
@comment{FIND, arXiv preprint, 2023. The original entry repeated the X-Decoder title; it is replaced here with the FIND paper's own title. The asterisk markers attached to some authors in the original listing were removed so that BibTeX can parse the names.}
@misc{zou2023find,
  author       = {Zou, Xueyan and Li, Linjie and Wang, Jianfeng and Yang, Jianwei and Ding, Mingyu and Yang, Zhengyuan and Li, Feng and Zhang, Hao and Liu, Shilong and Aravinthan, Arul and Lee, Yong Jae and Wang, Lijuan},
  title        = {Interfacing Foundation Models' Embeddings},
  howpublished = {arXiv preprint},
  year         = {2023},
}
This website is adapted from Nerfies, licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.