<!DOCTYPE html>
<html>
<head>
<!-- Standard Meta -->
<meta charset="utf-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" />
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0">
<!-- Site Properties -->
<title>Airbert - ICCV 2021</title>
<!-- SEO -->
<meta property="og:title" content="Airbert: In-domain Pretraining for Vision-and-Language Navigation" />
<meta property="og:type" content="article" />
<meta property="og:description" content="SOTA in multiple VLN tasks by pre-training on Airbnb" />
<meta property="og:image" content="https://airbert-vln.github.io/assets/img/teaser.jpeg" />
<meta property="og:url" content="https://airbert-vln.github.io/" />
<!-- Twitter Card data -->
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="Airbert: In-domain Pretraining for Vision-and-Language Navigation" />
<meta name="twitter:description" content="SOTA in multiple VLN tasks by pre-training on Airbnb" />
<meta name="twitter:image" content="https://airbert-vln.github.io/assets/img/teaser_square.jpeg" />
<!-- You MUST include jQuery before Fomantic -->
<script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/jquery.min.js"></script>
<link rel="stylesheet" type="text/css" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/semantic.min.css">
<script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/semantic.min.js"></script>
<script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<style type="text/css">
.hidden.menu {
display: none;
}
.masthead.segment {
min-height: 700px;
padding: 1em 0em;
}
.masthead .logo.item img {
margin-right: 1em;
}
.masthead .ui.menu .ui.button {
margin-left: 0.5em;
}
.masthead h1.ui.header {
margin-top: 3em;
margin-bottom: 0em;
font-size: 4em;
font-weight: normal;
}
.masthead h2 {
font-size: 1.7em;
font-weight: normal;
}
.ui.vertical.stripe {
padding: 8em 0em;
}
.ui.vertical.stripe h3 {
font-size: 2em;
}
.ui.vertical.stripe .button + h3,
.ui.vertical.stripe p + h3 {
margin-top: 3em;
}
.ui.vertical.stripe .floated.image {
clear: both;
}
.ui.vertical.stripe p {
font-size: 1.33em;
}
.ui.vertical.stripe .horizontal.divider {
margin: 3em 0em;
}
.quote.stripe.segment {
padding: 0em;
}
.quote.stripe.segment .grid .column {
padding-top: 5em;
padding-bottom: 5em;
}
.footer.segment {
padding: 5em 0em;
}
.secondary.pointing.menu .toc.item {
display: none;
}
@media only screen and (max-width: 700px) {
.ui.fixed.menu {
display: none !important;
}
.secondary.pointing.menu .item,
.secondary.pointing.menu .menu {
display: none;
}
.secondary.pointing.menu .toc.item {
display: block;
}
.masthead.segment {
min-height: 350px;
}
.masthead h1.ui.header {
font-size: 2em;
margin-top: 1.5em;
}
.masthead h2 {
margin-top: 0.5em;
font-size: 1.5em;
}
}
p {
text-align: justify;
font-size: 12pt;
}
.masthead {
background-image: url('/assets/img/bg3.jpg') !important;
background-size: cover !important;
}
.masthead.segment {
min-height: 300px;
}
.masthead h1.ui.header {
margin-top: 0em;
}
.masthead .ui.text a {
margin-bottom: 40px;
}
.masthead a {
color: #EEE;
}
.ui.small.table {
font-size: .8em;
}
</style>
<script>
$(document)
.ready(function() {
// fix menu when passed
$('.masthead')
.visibility({
once: false,
onBottomPassed: function() {
$('.fixed.menu').transition('fade in');
},
onBottomPassedReverse: function() {
$('.fixed.menu').transition('fade out');
}
})
;
// create sidebar and attach to menu open
$('.ui.sidebar')
.sidebar('attach events', '.toc.item')
;
})
;
</script>
</head>
<body>
<!-- Following Menu -->
<div class="ui large top fixed hidden menu">
<div class="ui container">
<a href="index.html" class="item">
<i class="home icon"></i>Home
</a>
<a href="demo.html" class="item">
<i class="robot icon"></i>Demo
</a>
<a href="paper.html" class="active item">
<i class="book icon"></i>Paper
</a>
<a href="https://arxiv.org/abs/2108.09105" class="item">
<i class="glasses icon"></i>arXiv
</a>
<a href="bibtex.txt" class="item">
<i class="quote right icon"></i>BibTeX
</a>
<a href="https://github.com/airbert-vln" class="item">
<i class="github icon"></i>GitHub
</a>
<a href="https://www.youtube.com/watch?v=veND1vIkdm" class="item">
<i class="youtube icon"></i>Video
</a>
</div>
</div>
<!-- Sidebar Menu -->
<div class="ui vertical inverted sidebar menu">
<a href="index.html" class="item">
<i class="home icon"></i>Home
</a>
<a href="demo.html" class="item">
<i class="robot icon"></i>Demo
</a>
<a href="paper.html" class="active item">
<i class="book icon"></i>Paper
</a>
<a href="https://arxiv.org/abs/2108.09105" class="item">
<i class="glasses icon"></i>arXiv
</a>
<a href="bibtex.txt" class="item">
<i class="quote right icon"></i>BibTeX
</a>
<a href="https://github.com/airbert-vln" class="item">
<i class="github icon"></i>GitHub
</a>
<a href="https://www.youtube.com/watch?v=veND1vIkdm" class="item">
<i class="youtube icon"></i>Video
</a>
</div>
<!-- Page Contents -->
<div class="pusher">
<div class="ui inverted vertical masthead center aligned segment">
<div class="ui large secondary inverted pointing menu">
<div class="ui container">
<a class="toc item">
<i class="sidebar icon"></i>
</a>
<a href="index.html" class="item">
<i class="home icon"></i>Home
</a>
<a href="demo.html" class="item">
<i class="robot icon"></i>Demo
</a>
<a href="paper.html" class="active item">
<i class="book icon"></i>Paper
</a>
<a href="https://arxiv.org/abs/2108.09105" class="item">
<i class="glasses icon"></i>arXiv
</a>
<a href="bibtex.txt" class="item">
<i class="quote right icon"></i>BibTeX
</a>
<a href="https://github.com/airbert-vln" class="item">
<i class="github icon"></i>GitHub
</a>
<a href="https://www.youtube.com/watch?v=veND1vIkdm" class="item">
<i class="youtube icon"></i>Video
</a>
</div>
</div>
<div class="ui text container">
<h1 class="ui inverted header">
Airbert
</h1>
<h2>
In-domain Pretraining for Vision-and-Language Navigation
</h2>
<h4>
<a href="https://www.linkedin.com/in/pierre-louis-guhur-51130495/">Pierre-Louis Guhur</a> <sup> 🏠</sup>,
<a href="https://makarandtapaswi.github.io/">Makarand Tapaswi</a> <sup>🏠, 🏢 </sup> ,
<a href="https://cshizhe.github.io/">Shizhe Chen</a> <sup>🏠</sup>,
<a href="https://www.di.ens.fr/~laptev/">Ivan Laptev <sup> 🏠</sup></a>,
<a href="https://www.di.ens.fr/willow/people_webpages/cordelia/">Cordelia Schmid</a>
<sup> 🏠, 🛖 </sup>
</h4>
<h4>
🏠
<a href="https://www.inria.fr"> Inria Paris</a>,
🏢
<a href="https://www.iiit.ac.in">IIIT Hyderabad</a>,
🛖
<a href="https://research.google">Google Research</a>
</h4>
</div>
<!-- <script>
$('.ui.embed').embed({
url: '/assets/video/bg.mp4',
autoplay: "true",
});
</script>
-->
</div>
<div class="ui segment" style="border-top: none">
<div class="ui text container">
<h1 class="ui header">Abstract</h1>
<p>
Vision-and-language navigation (VLN) aims to enable embodied agents to navigate in realistic environments using natural language instructions.
Given the scarcity of domain-specific training data and the high diversity of image and language inputs, the generalization of VLN agents to unseen environments remains challenging.
</p>
<p>
Recent methods explore pretraining to improve generalization; however, the use of generic image-caption datasets or existing small-scale VLN environments is suboptimal and results in limited improvements.
</p>
<p>
In this work, we introduce <a href="https://github.com/airbert-vln/bnb-dataset/">BnB</a>, a large-scale and diverse in-domain VLN dataset.
We first collect image-caption (IC) pairs from hundreds of thousands of listings from online rental marketplaces.
Using these IC pairs, we next propose automatic strategies to generate millions of VLN path-instruction (PI) pairs.
We further propose a shuffling loss that improves the learning of temporal order inside PI pairs.
</p>
<p>
We use <a href="https://github.com/airbert-vln/bnb-dataset/">BnB</a> to pretrain our <a href="https://github.com/airbert-vln/airbert/">Airbert</a> model that can be adapted to discriminative and generative settings, and show that it outperforms the state of the art on the <a href="https://bringmeaspoon.org/">Room-to-Room (R2R)</a> navigation and <a href="https://arxiv.org/abs/1904.10151">Remote Referring Expression (REVERIE)</a> benchmarks.
Moreover, our in-domain pretraining significantly increases performance on a challenging few-shot VLN evaluation, where we train the model only on VLN instructions from a few houses.
</p>
<a class="ui primary large button">Read the paper</a>
<a class="ui primary basic large button">Supplementary material</a>
<div class="ui segment" id="intro">
<h1 class="ui header">1. Introduction</h1>
<figure>
<img src="/assets/img/teaser.svg" alt="VLN tasks are evaluated on unseen environments at test time. Top: None of the training houses contain a Christmas theme making this test environment particularly challenging. Bottom: We build a large-scale, visually diverse, and in-domain dataset by creating path-instruction pairs close to a VLN-like setup and show the benefits of self-supervised pretraining." /><figcaption>Figure 1: VLN tasks are evaluated on unseen environments at test time. <em>Top</em>: None of the training houses contain a Christmas theme making this test environment particularly challenging. <em>Bottom</em>: We build a large-scale, visually diverse, and in-domain dataset by creating path-instruction pairs close to a VLN-like setup and show the benefits of self-supervised pretraining.</figcaption>
</figure>
<p>In vision-and-language navigation (VLN), an agent is asked to navigate in home environments following natural language instructions <span class="citation" data-cites="anderson2018evaluation anderson2018r2r">[1], [2]</span>. This task is attractive to many real-world applications such as domestic robotics and personal assistants. However, given the high diversity of VLN data across environments and the difficulty of the manual collection and annotation of VLN training data at scale, the performance of current methods remains limited, especially for previously unseen environments <span class="citation" data-cites="zhangdiagnosing">[3]</span>.</p>
<p>Our work is motivated by significant improvements in vision and language pretraining <span class="citation" data-cites="alberti2019b2t2 chen2020uniter li2020oscar lu2019vilbert lu2020_12in1 su2019vlbert">[4]–[9]</span>, where deep transformer models <span class="citation" data-cites="vaswani2017attention">[10]</span> are trained via self-supervised proxy tasks <span class="citation" data-cites="devlin2018bert">[11]</span> using large-scale, automatically harvested image-text datasets <span class="citation" data-cites="ordonez2011sbu ConceptualCaptions">[12], [13]</span>. Such pretraining enables learning transferable multi-modal representations achieving state-of-the-art performance in various vision and language tasks. Similarly, with the goal of learning an embodied agent that generalizes, recent works <span class="citation" data-cites="hao2020prevalent huang2019transferable li2019press majumdar2020vlnbert">[14]–[17]</span> have explored different pretraining approaches for VLN tasks.</p>
<p>In <span class="citation" data-cites="hao2020prevalent huang2019transferable">[14], [15]</span>, annotated path-instruction pairs are augmented with a <em>speaker</em> model that generates instructions for random unseen paths. However, as these paths originate from a small set of 61 houses used during training, they are limited in visual diversity. The limited pretraining environments do not equip agents with visual understanding abilities that enable generalization to unseen houses, see Fig. <a href="#fig:teaser" data-reference-type="ref" data-reference="fig:teaser">1</a>. To address this problem, VLN-BERT <span class="citation" data-cites="majumdar2020vlnbert">[17]</span> proposes to pretrain the agent on generic image-caption datasets that are abundant and cover diverse visio-linguistic knowledge. However, these image-caption pairs are quite different from the dynamic visual stream (path) and navigable instructions observed by a VLN agent, and such out-of-domain pretraining, although promising, only brings limited gains to the navigation performance. Besides the above limitations, existing pretraining methods do not place much emphasis on temporal reasoning abilities in their self-supervised proxy tasks such as one-step action prediction <span class="citation" data-cites="hao2020prevalent">[14]</span> and path-instruction pairing <span class="citation" data-cites="majumdar2020vlnbert">[17]</span>, while such reasoning is important to a sequential decision making task like VLN. As a result, even if performance in downstream tasks is improved, the pretrained models may still be brittle. For example, a simple corruption of instructions by swapping noun phrases within the instruction, or replacing them with other nouns, leads to significant confusion as models are unable to pick the correct original pair.</p>
<p>In this paper, we explore a different data source and proxy tasks to address the above limitations in pretraining a generic VLN agent. Though navigation instructions are rarely found on the Internet, image-caption pairs from home environments are abundant in online marketplaces (<em>e.g.</em>, <em>Airbnb</em>), which include images and descriptions of rental listings. We collect BnB, a new large-scale dataset with 1.4M indoor images and 0.7M captions. First, we show that in-domain image-caption pairs bring additional benefits for downstream VLN tasks when combined with generic web data <span class="citation" data-cites="majumdar2020vlnbert">[17]</span>. In order to further reduce the domain gap between the BnB pretraining and the VLN task, we present an approach to transform static image-caption pairs into visual paths and navigation-like instructions (Fig. <a href="#fig:teaser" data-reference-type="ref" data-reference="fig:teaser">1</a> bottom), leading to large additional performance gains. We also propose a shuffling loss that improves the model’s temporal reasoning abilities by learning a temporal alignment between a path and the corresponding instruction.</p>
<p>Our pretrained model, Airbert, is a generic transformer backbone that can be readily integrated in both discriminative VLN tasks such as path-instruction compatibility prediction <span class="citation" data-cites="majumdar2020vlnbert">[17]</span> and generative VLN tasks <span class="citation" data-cites="hong2021recurrentvln">[18]</span> in R2R navigation <span class="citation" data-cites="anderson2018r2r">[2]</span> and REVERIE remote referring expression <span class="citation" data-cites="qi2020reverie">[19]</span>. We achieve state-of-the-art performance on these VLN tasks with our pretrained model. Beyond the standard evaluation, our in-domain pretraining opens an exciting new direction of <em>one/few-shot VLN</em> where the agent is trained on examples only from one/few environment(s) and expected to generalize to other unseen environments.</p>
<p>In summary, the contributions of this work are three-fold. (1) We collect a new large-scale in-domain dataset, BnB, to promote pretraining for vision-and-language navigation tasks. (2) We curate the dataset in different ways to reduce the distribution shift between pretraining and VLN and also propose the shuffling loss to improve temporal reasoning abilities. (3) Our pretrained Airbert can be plugged into generative or discriminative architectures and achieves state-of-the-art performance on R2R and REVERIE datasets. Moreover, our model generalizes well under a challenging one/few-shot VLN evaluation, truly highlighting the capabilities of our learning paradigm. We will release the code, model, and data.</p>
</div>
<div class="ui segment" id="bnb_dataset">
<h1 class="ui header">2. Related Work</h1>
<p><strong>Vision-and-language navigation.</strong> VLN <span class="citation" data-cites="anderson2018r2r">[2]</span> has received significant attention with a large number of followup tasks introduced in recent years <span class="citation" data-cites="anderson2018evaluation chen2019touchdown krantz2020r2rce ku2020rxr nguyen2019hanna nguyen2019vlna qi2020reverie shridhar2020alfred thomason2020cvdn">[1], [19]–[26]</span>. Early days of VLN saw the use of sequence-to-sequence LSTMs to predict low-level actions <span class="citation" data-cites="anderson2018r2r">[2]</span> or high-level directions in a panoramic action space <span class="citation" data-cites="fried2018speaker">[27]</span>. For better cross-modal alignment, a visio-linguistic co-grounding attention mechanism is proposed in <span class="citation" data-cites="ma2019self">[28]</span>, and instructions are further disentangled into objects and directions in <span class="citation" data-cites="qi2020object">[29]</span>. To alleviate exposure bias in supervised training of the agent, reinforcement learning has been adopted through planning <span class="citation" data-cites="wang2018look">[30]</span>, REINFORCE <span class="citation" data-cites="wang2019reinforced">[31]</span>, A2C <span class="citation" data-cites="tan2019envdrop">[32]</span> and reward learning <span class="citation" data-cites="wang2020serl">[33]</span>. A few works also explore different search algorithms such as backtracking by monitoring progress <span class="citation" data-cites="ma2019self ma2019regretful">[28], [34]</span> or beam search <span class="citation" data-cites="fried2018speaker ke2019tactical tan2019envdrop">[27], [32], [35]</span> in environment exploration.</p>
<p>To improve an agent’s generalization to unseen environments, data augmentation is performed by using a <em>speaker</em> model <span class="citation" data-cites="fried2018speaker">[27]</span> that generates instructions for random paths in seen environments, and environment dropout <span class="citation" data-cites="tan2019envdrop">[32]</span> is used to mimic new environments. While pretraining LSTMs to learn vision and language representations is adopted by <span class="citation" data-cites="huang2019transferable">[15]</span>, recently, there has been a shift towards transformer models <span class="citation" data-cites="hao2020prevalent">[14]</span> to learn generic multimodal representations. This is further extended to a recurrent model that significantly improves sequential action prediction <span class="citation" data-cites="hong2021recurrentvln">[18]</span>. However, the limited environments in pretraining <span class="citation" data-cites="hao2020prevalent huang2019transferable">[14], [15]</span> constrain the generalization ability to unseen scenarios. Most related to this work, VLN-BERT <span class="citation" data-cites="majumdar2020vlnbert">[17]</span> transfers knowledge from abundant, but out-of-domain image-text data to improve path-instruction matching. In this work, we not only create a large-scale, in-domain BnB dataset to improve visual diversity, but also propose effective pretraining strategies to mitigate the domain-shift between webly crawled image-text pairs and VLN data.</p>
<p><strong>Large-scale visio-linguistic pretraining.</strong> Thanks to large-scale vision-language pairs automatically collected from the web <span class="citation" data-cites="miech2019howto100m ordonez2011sbu radford2021learning ConceptualCaptions">[12], [13], [36], [37]</span>, visio-linguistic pretraining (VLP) has made great breakthroughs in recent years towards learning transferable multimodal representations. Several VLP models <span class="citation" data-cites="chen2020uniter li2020oscar lu2019vilbert tan2019lxmert">[5]–[7], [38]</span> have been proposed based on the transformer architecture <span class="citation" data-cites="vaswani2017attention">[10]</span>. These models are often pretrained with self-supervised objectives akin to those in BERT <span class="citation" data-cites="devlin2018bert">[11]</span>: masked language modeling, masked region modeling and vision-text pairing. Fine-tuning them on downstream datasets achieves state-of-the-art performance on various VL tasks <span class="citation" data-cites="antol2015vqa kazemzadeh2014referitgame wang2016learning vinyals2016show">[39]–[42]</span>. While such pretraining focuses on learning correlations between vision and text, it is not designed for sequential decision making as required in embodied VLN. The goal of this work is not to improve VLP architectures but to present in-domain training strategies that lead to performance improvements for VLN tasks.</p>
</div>
<div class="ui segment" id="bnb_dataset">
<div class="left ui rail" style="">
<p> The number of images from Matterport environments <span class="citation" data-cites="Matterport3D">[44]</span> refers to the number of panoramas. The speaker model <span class="citation" data-cites="tan2019envdrop">[32]</span> generates instructions for randomly selected trajectories, but is limited to panoramas from 60 training environments. Note that the data from Conceptual Captions (ConCaps) may feature some houses, but houses are not its main category. </p>
<div class="ui sticky">
<h4 class="ui header" id="tab:bnb_dataset_cmpr">Table 1: Comparing BnB to other existing VLN datasets</h4>
<table class="ui small striped table" style="table-layout: fixed">
<thead>
<tr class="header">
<th style="text-align: left;">Dataset</th>
<th style="text-align: left;">Source</th>
<th style="text-align: center;">#Envs</th>
<th style="text-align: center;">#Imgs</th>
<th style="text-align: center;">#Texts</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td style="text-align: left;">R2R <span class="citation" data-cites="anderson2018r2r">[2]</span></td>
<td style="text-align: left;">Matterport</td>
<td style="text-align: center;">90</td>
<td style="text-align: center;">10.8K</td>
<td style="text-align: center;">21.7K</td>
</tr>
<tr class="even">
<td style="text-align: left;">REVERIE <span class="citation" data-cites="qi2020reverie">[19]</span></td>
<td style="text-align: left;">Matterport</td>
<td style="text-align: center;">86</td>
<td style="text-align: center;">10.6K</td>
<td style="text-align: center;">10.6K</td>
</tr>
<tr class="odd">
<td style="text-align: left;">Speaker <span class="citation" data-cites="tan2019envdrop">[32]</span></td>
<td style="text-align: left;">Matterport</td>
<td style="text-align: center;">60</td>
<td style="text-align: center;">7.8K</td>
<td style="text-align: center;">0.2M</td>
</tr>
<tr class="even">
<td style="text-align: left;">ConCaps <span class="citation" data-cites="ConceptualCaptions">[13]</span></td>
<td style="text-align: left;">Web</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">3.3M</td>
<td style="text-align: center;">3.3M</td>
</tr>
<tr class="odd">
<td style="text-align: left;"><strong>BnB</strong> (ours)</td>
<td style="text-align: left;">Airbnb</td>
<td style="text-align: center;">140K</td>
<td style="text-align: center;">1.4M</td>
<td style="text-align: center;">0.7M</td>
</tr>
</tbody>
</table>
</div>
</div>
<h1 class="ui header">3. BnB Dataset </h1>
<p>Hosts that rent places on online marketplaces often upload attractive and unique photos along with descriptions. One such marketplace, <em>Airbnb</em>, has 5.6M listings from over 100K cities all around the world <span class="citation" data-cites="airbnb">[43]</span>. We propose to use this abundant and curated data for large-scale in-domain VLN pretraining. In this section, we first describe how we collect image-caption pairs from <em>Airbnb</em>. Then, we propose methods to transform images and captions into VLN-like path-instruction pairs to reduce the domain gap between webly crawled image-text pairs and VLN tasks (see Fig. <a href="#fig:dataset" data-reference-type="ref" data-reference="fig:dataset">2</a>).</p>
<figure>
<img class="ui fluid image" src="/assets/img/dataset.svg" />
<figcaption>Figure 2: We explore several strategies to automatically create navigation-like instructions from image-caption pairs. </figcaption>
</figure>
<h2 class="ui header">3.1. Collecting BnB Image-Caption Pairs</h2>
<p>
<strong>Collection process.</strong> We restrict our dataset to listings from the US (about 10% of <em>Airbnb</em>) to ensure high-quality English captions and visual similarity with Matterport environments <span class="citation" data-cites="Matterport3D">[44]</span>. The data collection proceeds as follows: (1) obtain a list of locations from Wikipedia; (2) find listings in these locations by querying the <em>Airbnb</em> search engine; (3) download listings and their metadata; (4) remove <em>outdoor</em> images<a href="#fn3" class="footnote-ref" id="fnref3"><sup>3</sup></a> as classified by a ResNet model pretrained on Places365 <span class="citation" data-cites="zhou2017places">[45]</span>; and (5) remove invalid image captions such as emails, URLs and duplicates.</p>
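<p>As an illustration of steps 4 and 5, the sketch below filters a listing's images and captions. It is a minimal example and not the released pipeline: <code>predict_places365</code> and the outdoor label set are placeholders for a Places365-pretrained classifier and its outdoor scene categories.</p>
<pre><code>import re

OUTDOOR_SCENES = {"patio", "yard", "beach", "swimming_pool/outdoor"}  # illustrative subset

def keep_image(image, predict_places365):
    """Step 4: drop images whose predicted Places365 scene is outdoors."""
    scene, _prob = predict_places365(image)   # assumed helper returning (label, probability)
    return scene not in OUTDOOR_SCENES

EMAIL_OR_URL = re.compile(r"(\S+@\S+)|(https?://\S+)", re.IGNORECASE)

def clean_captions(captions):
    """Step 5: remove invalid captions (emails, URLs) and duplicates."""
    seen, kept = set(), []
    for caption in captions:
        caption = caption.strip()
        if not caption or EMAIL_OR_URL.search(caption) or caption.lower() in seen:
            continue
        seen.add(caption.lower())
        kept.append(caption)
    return kept
</code></pre>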
<p><strong>Statistics.</strong> We downloaded almost 150K listings and their metadata (1/4 of the listings in the US) in step 3, leading to over 3M images and 1M captions. After data cleaning with steps 4 and 5, we obtain 713K image-caption pairs and 676K images without captions. Table <a href="#tab:bnb_dataset_cmpr" data-reference-type="ref" data-reference="tab:bnb_dataset_cmpr">1</a> compares our BnB dataset to other datasets used in previous works for VLN (pre-)training. It is larger than R2R <span class="citation" data-cites="anderson2018r2r">[2]</span> and REVERIE <span class="citation" data-cites="qi2020reverie">[19]</span>, and includes a large diversity of rooms and objects, which is not the case for Conceptual Captions <span class="citation" data-cites="ConceptualCaptions">[13]</span>. We posit that such in-domain data is crucial to deal with the data scarcity challenge in VLN environments as illustrated <a href="#motivation">above</a>. We use 95% of our BnB dataset for training and the remaining 5% for validation.</p>
<p>Apart from images and captions, our collected listings contain structured data including a list of amenities, a general description, reviews, location, and rental price, which may offer additional applications in the future. More details about the dataset and examples are presented in the supplementary material.</p>
<h2 class="ui header">3.2. Creating BnB Path-Instruction Pairs</h2>
<p>
BnB image-caption (IC) pairs are complementary to Conceptual Captions (ConCaps) as they capture diverse VLN environments. However, they still have large differences from path-instruction (PI) pairs in VLN tasks. For example, during navigation, an agent observes multiple panoramic views of a sequence of locations rather than a single image, and the instruction may contain multiple sentences describing different locations along the way. To mitigate this domain gap, we propose strategies to automatically craft path-instruction pairs starting from BnB-IC pairs.</p>
<h3 class="ui header">Building path-instruction pairs</h3>
<p>Images in a BnB listing usually depict different locations in a house, mimicking the sequential visual observations an agent makes while navigating in the house. To create a VLN-like path-instruction pair, we randomly select and concatenate <span class="math inline"><em>K</em></span><a href="#fn4" class="footnote-ref" id="fnref4"><sup>4</sup></a> image-caption pairs from the listing. Between consecutive captions, we randomly insert “<em>and</em>”, “<em>then</em>”, a period, or nothing to make the concatenated instruction more fluent and diverse.</p>
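<p>The concatenation strategy can be summarised with a short sketch. It assumes a listing is given as a list of image-caption pairs; the range of <em>K</em> and the sampling details below are illustrative, not the exact configuration used in the paper.</p>
<pre><code>import random

CONNECTORS = [" and ", " then ", ". ", " "]  # "and", "then", a period, or nothing

def make_path_instruction(listing, k_min=4, k_max=7):
    """Concatenate K randomly chosen image-caption pairs from one listing."""
    k = random.randint(min(k_min, len(listing)), min(k_max, len(listing)))
    pairs = random.sample(listing, k)
    path = [image for image, _ in pairs]
    captions = [caption.strip() for _, caption in pairs]
    instruction = captions[0]
    for caption in captions[1:]:
        instruction += random.choice(CONNECTORS) + caption  # random transition
    return path, instruction
</code></pre>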
<h3 class="ui header">Augmenting <em>Paths</em> with Visual Contexts</h3>
<p>In the above concatenated path, each location only contains one BnB image, possibly with a limited view angle, as hosts may focus on objects or amenities they wish to highlight. Therefore, it lacks the panoramic visual context at each location that the agent receives in real navigation paths. Moreover, each location in the concatenated instruction is described by a separate sentence, while adjacent locations are often expressed together in one sentence in VLN instructions <span class="citation" data-cites="hong2020fgr2r">[46]</span>. To address the above issues with concatenation, we propose two approaches to compose paths that have more visual context and can also leverage the abundant images without captions (denoted as <em>captionless images</em>).</p>
<p><strong>Image merging</strong> extends the panoramic context of a location by grouping images from similar room categories (see Fig. <a href="#fig:dataset" data-reference-type="ref" data-reference="fig:dataset">2</a>). For example, if an image depicts a kitchen sink, it is natural to expect images of other objects such as forks and knives nearby. Specifically, we first cluster images of similar categories (<em>e.g.</em>, <em>kitchen</em>) using room labels predicted by a pretrained Places365 model <span class="citation" data-cites="zhou2017places">[45]</span>. Then, we extract multiple regions from this <em>merged</em> set of images, and use them as an approximation to the panoramic visual representation.</p>
<p><strong>Captionless image insertion.</strong> Table 1 shows that half of the BnB images are captionless. Using them allows us to increase the size of the dataset. When creating a path-instruction pair with the concatenation approach, a captionless image is inserted as if its caption were an empty string. The BnB PI pairs generated this way better approximate the distribution of the R2R path-instructions: (1) some images in the path are not described and (2) instructions have a similar number of noun phrases.</p>
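<p>Both augmentations are simple to express in code. The sketch below assumes room labels come from the same Places365 classifier as above and that captionless images from the listing are available; the function names and the number of insertions are illustrative.</p>
<pre><code>import random
from collections import defaultdict

def merge_by_room(images, room_labels):
    """Image merging: group a listing's images by predicted room category,
    so that one path step carries several views of the same location."""
    groups = defaultdict(list)
    for image, label in zip(images, room_labels):
        groups[label].append(image)
    return list(groups.values())   # each group approximates a panorama

def insert_captionless(pairs, captionless_images, max_inserts=2):
    """Captionless image insertion: splice captionless images into a
    concatenated path as if their caption were an empty string."""
    pairs = list(pairs)
    n = min(max_inserts, len(captionless_images))
    for image in random.sample(captionless_images, n):
        position = random.randrange(len(pairs) + 1)
        pairs.insert(position, (image, ""))   # empty caption
    return pairs
</code></pre>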
<h3 class="ui header">Crafting <em>Instructions</em> with Fluent Transitions</h3>
<p>The concatenated captions mainly describe rooms or objects at different locations, but do not contain any of the actionable verbs found in navigation instructions, <em>e.g.</em>, “<em>turn left at the door</em>” or “<em>walk straight down the corridor</em>”. We propose two strategies to create synthetic instructions that have fluent transitions between sentences.</p>
<p><strong>Instruction rephrasing.</strong> We use a fill-in-the-blanks approach to replace noun-phrases in human annotated navigation instructions <span class="citation" data-cites="anderson2018r2r">[2]</span> by those in BnB captions (see Fig. <a href="#fig:dataset" data-reference-type="ref" data-reference="fig:dataset">2</a>). Concretely, we create more than 10K instruction templates containing 2-7 blanks, and fill the blanks with noun-phrases extracted from BnB captions. The noun-phrases matched to object categories from the Visual Genome <span class="citation" data-cites="krishna2017vg">[47]</span> dataset are preferred during selection. This allows us to create VLN-like instructions with actionable verbs interspersed with room and object references for visual cues that are part of the BnB path (see Fig. <a href="#fig:dataset" data-reference-type="ref" data-reference="fig:dataset">2</a>).</p>
<p><strong>Instruction generation.</strong> We train a video-captioning-like model that takes in a sequence of images and generates an instruction corresponding to an agent’s path through an environment. To train this model, we adopt ViLBERT and train it to generate captions for single BnB image-caption pairs. The model is then fine-tuned on trajectories of the R2R dataset to generate corresponding instructions. Finally, we use it to generate BnB PI pairs by producing an instruction for a concatenated image sequence from BnB (the path).</p>
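<p>A minimal sketch of the fill-in-the-blanks rephrasing is shown below; the blank marker, the example template, and the noun phrases are made up for illustration, whereas the real templates are derived from R2R instructions as described above.</p>
<pre><code>import random

def rephrase(template, noun_phrases):
    """Fill a blanked navigation template with noun phrases from BnB captions."""
    parts = template.split("___")            # "___" marks a blank (assumed convention)
    n_blanks = len(parts) - 1
    fills = random.sample(noun_phrases, n_blanks)
    instruction = parts[0]
    for fill, rest in zip(fills, parts[1:]):
        instruction += fill + rest
    return instruction

# Illustrative usage with a made-up template and caption noun phrases
template = "walk past the ___ , then turn left at the ___ and stop near the ___"
phrases = ["queen bed", "stone fireplace", "kitchen island", "blue sofa"]
print(rephrase(template, phrases))
</code></pre>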
</div>
<script>
$('.ui.sticky')
.sticky({
context: '#bnb_dataset'
})
;
</script>
<div class="ui segment">
<h1 class="ui header">4. Airbert: A Pretrained VLN Model</h1>
<figure>
<img class="ui fluid image" src="/assets/img/pretraining.svg" />
<figcaption>
Figure 3: Overview of Airbert pretraining. A ViLBERT-like two-stream transformer is pretrained on ConCaps and BnB path-instruction pairs with masking and shuffling losses before being adapted to downstream VLN tasks.
</figcaption>
</figure>
<p>In this section, we present Airbert, our multi-modal transformer pretrained on the BnB dataset with masking and shuffling losses. We first introduce the architecture of Airbert, and then describe datasets and pretext tasks in pretraining. Finally, we show how Airbert can be adapted to downstream VLN tasks.</p>
<h2 class="ui header">4.1. ViLBERT-like Architecture</h2>
<p> ViLBERT <span class="citation" data-cites="lu2019vilbert">[7]</span> is a multi-modal transformer extended from BERT <span class="citation" data-cites="devlin2018bert">[11]</span> to learn joint visio-linguistic representations from image-text pairs, as illustrated in Fig. <a href="#fig:model" data-reference-type="ref" data-reference="fig:model">3</a>.</p>
<p>Given an image-text pair <span class="math inline">(<em>V</em>, <em>C</em>)</span>, the model encodes the image as region features <span class="math inline">[<em>v</em><sub>1</sub>, …, <em>v</em><sub>𝒱</sub>]</span> via a pretrained Faster R-CNN <span class="citation" data-cites="anderson2017butd">[48]</span>, and embeds the text as a series of tokens: <span class="math inline">[<code>[CLS]</code>, <em>w</em><sub>1</sub>, …, <em>w</em><sub><em>T</em></sub>, <code>[SEP]</code>]</span>, where <code>[CLS]</code> and <code>[SEP]</code> are special tokens added to the text. ViLBERT contains two separate transformers that encode <span class="math inline"><em>V</em></span> and <span class="math inline"><em>C</em></span> and it learns cross-modal interactions via co-attention <span class="citation" data-cites="lu2019vilbert">[7]</span>.</p>
<p>We follow a similar strategy to encode path-instruction pairs (created in Sec. <a href="#sec:create_pi_pairs" data-reference-type="ref" data-reference="sec:create_pi_pairs">3.1</a>) that contain multiple images and captions <span class="math inline">{(<em>V</em><sub><em>k</em></sub>, <em>C</em><sub><em>k</em></sub>)}<sub><em>k</em> = 1</sub><sup><em>K</em></sup></span>. Here, each <span class="math inline"><em>V</em><sub><em>k</em></sub></span> is represented as visual regions <span class="math inline"><em>v</em><sub><em>i</em></sub><sup><em>k</em></sup></span> and <span class="math inline"><em>C</em><sub><em>k</em></sub></span> as word tokens <span class="math inline"><em>w</em><sub><em>t</em></sub><sup><em>k</em></sup></span>. Respectively, the visual and text inputs to Airbert are: <span> <br /><span class="math display">$$\begin{aligned}
X_V &= [\texttt{[IMG]}, v^1_1, \ldots, v^1_{\mathcal{V}_1}, \ldots, \texttt{[IMG]}, v^K_1, \ldots, v^K_{\mathcal{V}_K}], \\
X_C &= [\texttt{[CLS]}, w^1_1, \ldots, w^1_{T_1}, \ldots, w^K_1, \ldots, w^K_{T_K}, \texttt{[SEP]}] ,\end{aligned}$$</span><br /></span> where the <code>[IMG]</code> token is used to separate image region features taken at different locations.</p>
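<p>Concretely, the two input streams can be assembled as in the sketch below, where <code>path_regions</code> holds the region features of each location and <code>tokenize</code> is any word tokenizer; both names are assumptions made for illustration.</p>
<pre><code>def build_inputs(path_regions, captions, tokenize):
    """Assemble the visual and textual token streams for a path-instruction pair."""
    visual = []
    for regions in path_regions:        # one entry per location V_k
        visual.append("[IMG]")          # separates locations along the path
        visual.extend(regions)
    text = ["[CLS]"]
    for caption in captions:            # concatenated instruction parts C_k
        text.extend(tokenize(caption))
    text.append("[SEP]")
    return visual, text
</code></pre>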
<p>Note that while our approach is not limited to a ViLBERT-like architecture, we choose ViLBERT for a fair comparison with previous work <span class="citation" data-cites="majumdar2020vlnbert">[17]</span>.</p>
<h2 class="ui header">4.2. Datasets and Pretext Tasks for Pretraining</h2>
<p>We use Conceptual Captions (ConCaps) <span class="citation" data-cites="ConceptualCaptions">[13]</span> and BnB-PI in subsequent pretraining steps (see Fig. <a href="#fig:model" data-reference-type="ref" data-reference="fig:model">3</a>) to reduce the domain gap for downstream VLN tasks.</p>
<p>Previous multi-modal pretraining efforts <span class="citation" data-cites="lu2019vilbert majumdar2020vlnbert huang2019transferable">[7], [15], [17]</span> commonly use two self-supervised losses given image-caption (IC) pairs or path-instruction (PI) pairs: (1) <em>Masking</em> loss: An input image region or word is randomly replaced by a <code>[MASK]</code> token. The output feature of this masked token is trained to predict the region label or the word given its multi-modal context. (2) <em>Pairing</em> loss: Given the output features of the <code>[IMG]</code> and <code>[CLS]</code> tokens, a binary classifier is trained to predict whether the image (path) and caption (instruction) are paired.</p>
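<p>For concreteness, a BERT-style masking step on the word (or region) sequence might look as follows; the masking rate and helper are illustrative and not the exact recipe used for Airbert.</p>
<pre><code>import random

MASK_PROB = 0.15   # illustrative masking rate, following BERT-style pretraining

def mask_tokens(tokens, mask_token="[MASK]"):
    """Randomly replace tokens with [MASK]; the model must recover them
    from the remaining multi-modal context."""
    masked, targets = [], []
    for token in tokens:
        if random.random() >= 1.0 - MASK_PROB:   # true with probability MASK_PROB
            masked.append(mask_token)
            targets.append(token)    # supervision at the masked position
        else:
            masked.append(token)
            targets.append(None)     # no loss at unmasked positions
    return masked, targets
</code></pre>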
<p>The above two pretext tasks mainly focus on learning object-word associations instead of reasoning about the temporal order of paths and instructions. For example, if an image <span class="math inline"><em>V</em><sub><em>i</em></sub></span> appears before <span class="math inline"><em>V</em><sub><em>j</em></sub></span>, then words from its caption <span class="math inline"><em>C</em><sub><em>i</em></sub></span> should appear before <span class="math inline"><em>C</em><sub><em>j</em></sub></span>. In order to promote such a temporal reasoning ability, we propose an additional <em>shuffling</em> loss to enforce alignment between PI pairs.</p>
<p>Given an aligned PI pair <span class="math inline"><em>X</em><sup>+</sup> = {(<em>V</em><sub><em>k</em></sub>, <em>C</em><sub><em>k</em></sub>)}<sub><em>k</em> = 1</sub><sup><em>K</em></sup></span>, we generate <span class="math inline">𝒩</span> negative pairs <span class="math inline"><em>X</em><sub><em>n</em></sub><sup>−</sup> = {(<em>V</em><sub><em>k</em></sub>, <em>C</em><sub><em>l</em></sub>)}, <em>k</em> ≠ <em>l</em></span>, by shuffling the composed images or the captions. We train our model to choose the aligned PI pair as compared to the shuffled negatives by minimizing the cross-entropy loss: <br /><span class="math display">$$L = -\log \frac{\exp(f(X^+))}{\exp(f(X^+)) + \sum_n \exp(f(X^-_n))} \, ,$$</span><br /> where <span class="math inline"><em>f</em>(<em>X</em>)</span> denotes the similarity score (logit) computed via Airbert for some PI pair <span class="math inline"><em>X</em></span>.</p>
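<p>The loss above can be written in a few lines of PyTorch. In the sketch below, <code>model(images, captions)</code> is assumed to return the scalar compatibility logit <em>f</em>(<em>X</em>) for one path-instruction pair, a simplification of the actual Airbert interface.</p>
<pre><code>import random
import torch
import torch.nn.functional as F

def shuffling_loss(model, images, captions, n_negatives=3):
    """Cross-entropy over one aligned pair and N shuffled negatives."""
    candidates = [captions]
    for _ in range(n_negatives):
        shuffled = captions[:]
        random.shuffle(shuffled)             # permuted captions break the alignment
        candidates.append(shuffled)
    logits = torch.stack([model(images, c) for c in candidates])   # shape (1 + N,)
    target = torch.zeros(1, dtype=torch.long)                      # index 0 = aligned pair
    return F.cross_entropy(logits.unsqueeze(0), target)
</code></pre>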
<h2 class="ui header">4.3. Adaptations for Downstream VLN tasks</h2>
<p>We consider two VLN tasks: goal-oriented navigation (R2R <span class="citation" data-cites="anderson2018r2r">[2]</span>) and object-oriented navigation (REVERIE <span class="citation" data-cites="qi2020reverie">[19]</span>). Airbert can be readily integrated in discriminative and generative models for the above VLN tasks.</p>
<p><strong>Discriminative Model: Path selection.</strong> The navigation problem on the R2R dataset is formulated as a path selection task in <span class="citation" data-cites="majumdar2020vlnbert">[17]</span>. Several candidate paths are generated via beam search from a navigation agent such as <span class="citation" data-cites="tan2019envdrop">[32]</span>, and a discriminative model is trained to choose the best path among them. We fine-tune Airbert on the R2R dataset for path selection. A two-stage fine-tuning process is adopted: in the first phase, we use <em>masking</em> and <em>shuffling</em> losses on the PI pairs of the target VLN dataset in a manner similar to BnB PI pairs; in the second phase, we choose a positive candidate path as one that arrives within 3m of the goal, and contrast it against 3 negative candidate paths. We also compare multiple strategies to mine additional negative pairs (other than the 3 negative candidates), and empirically show that negatives created by shuffling outperform the other options.</p>
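<p>At test time, the discriminative model reduces navigation to scoring and picking one of the beam-search candidates, as in the short sketch below (reusing the assumed <code>model(path, instruction)</code> compatibility score from the previous sketch).</p>
<pre><code>def select_path(model, instruction, candidate_paths):
    """Path selection: score every beam-search candidate against the
    instruction and execute the highest-scoring one."""
    scores = [model(path, instruction) for path in candidate_paths]
    best = max(range(len(candidate_paths)), key=lambda i: scores[i])
    return candidate_paths[best]
</code></pre>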
<p><strong>Generative Model: Recurrent VLN-BERT <span class="citation" data-cites="hong2021recurrentvln">[18]</span>.</strong> The Recurrent VLN-BERT model adds recurrence to a state in the transformer to sequentially predict actions, achieving state-of-the-art performance on R2R and REVERIE tasks. We use our Airbert architecture as its backbone and apply it to the two tasks as follows. First, the language transformer encodes the instruction via self-attention. Then, the embedded <code>[CLS]</code> token in the instruction is used to track history and concatenated with visual tokens (observable navigable views or objects) in each action step. Self-attention and cross-attention on embedded instructions are employed to update the state and visual tokens and the attention score from the state token to visual tokens is used to decide the action at each step. We fine-tune the Recurrent VLN-BERT model with Airbert as the backbone in the same way as <span class="citation" data-cites="hong2021recurrentvln">[18]</span>.</p>
<p>Please refer to the supplementary material for additional details about the models and their implementation.</p>
</div>
<div class="ui segment" id="xp">
<h1 class="ui header">Experimental Results</h1>
<div class="left ui rail" style="">
<div class="ui sticky results">
<table class="ui small striped table">
<caption>Table 2: Comparison between various BnB PI pair creation strategies for pretraining. The first row denotes the use of image-caption pairs. All methods from the second row use masking and shuffling during pretraining. Cat: naive concatenation; Rep: instruction rephrasing; Gen: instruction generation; Merge: image merging; and Insert: captionless image insertion.</caption>
<tbody>
<tr class="even">
<td style="text-align: left;"></td>
<td style="text-align: center;"></td>
<td style="text-align: center;">Rep</td>
<td style="text-align: center;">Gen</td>
<td style="text-align: center;">Merge</td>
<td style="text-align: center;">Insert</td>
<td style="text-align: center;">Seen</td>
<td style="text-align: center;">Unseen</td>
</tr>
<tr class="odd">
<td style="text-align: left;">1</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">71.21</td>
<td style="text-align: center;">62.45</td>
</tr>
<tr class="even">
<td style="text-align: left;">2</td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">73.84</td>
<td style="text-align: center;">62.71</td>
</tr>
<tr class="odd">
<td style="text-align: left;">3</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">72.67</td>
<td style="text-align: center;">63.35</td>
</tr>
<tr class="even">
<td style="text-align: left;">4</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">71.19</td>
<td style="text-align: center;">63.11</td>
</tr>
<tr class="odd">
<td style="text-align: left;">5</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">70.51</td>
<td style="text-align: center;">64.07</td>
</tr>
<tr class="even">
<td style="text-align: left;">6</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">74.43</td>
<td style="text-align: center;">66.05</td>
</tr>
<tr class="odd">
<td style="text-align: left;">7</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">73.57</td>
<td style="text-align: center;"><strong>66.52</strong></td>
</tr>
</tbody>
</table>
<table class="ui small striped table">
<caption>Table 4: Comparison between different strategies for fine-tuning a ViLBERT model on the R2R task. VLN-BERT <span class="citation" data-cites="majumdar2020vlnbert">[17]</span> fine-tunes ViLBERT with a masking and ranking loss. Each row (described in the text) is an independent data augmentation and can be compared directly against the baseline (row 1). </caption>
<tbody>
<tr class="odd">
<td style="text-align: left;"></td>
<td style="text-align: left;">Fine-tuning</td>
<td style="text-align: center;">Additional</td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
</tr>
<tr class="even">
<td style="text-align: left;"></td>
<td style="text-align: left;">Strategies</td>
<td style="text-align: center;">Negatives</td>
<td style="text-align: center;">Seen</td>
<td style="text-align: center;">Unseen</td>
</tr>
<tr class="odd">
<td style="text-align: left;">1</td>
<td style="text-align: left;">VLN-BERT <span class="citation" data-cites="majumdar2020vlnbert">[17]</span></td>
<td style="text-align: center;">0</td>
<td style="text-align: center;">70.20</td>
<td style="text-align: center;">59.26</td>
</tr>
<tr class="even">
<td style="text-align: left;">2</td>
<td style="text-align: left;">(1) + Wrong trajectories</td>
<td style="text-align: center;">2</td>
<td style="text-align: center;">70.11</td>
<td style="text-align: center;">59.11</td>
</tr>
<tr class="odd">
<td style="text-align: left;">3</td>
<td style="text-align: left;">(1) + Highlight keywords</td>
<td style="text-align: center;">0</td>
<td style="text-align: center;">71.89</td>
<td style="text-align: center;">61.37</td>
</tr>
<tr class="even">
<td style="text-align: left;">4</td>
<td style="text-align: left;">(1) + Hard negatives</td>
<td style="text-align: center;">2</td>
<td style="text-align: center;">71.89</td>
<td style="text-align: center;">61.63</td>
</tr>
<tr class="odd">
<td style="text-align: left;">5</td>
<td style="text-align: left;">(1) + Shuffling (Ours)</td>
<td style="text-align: center;">2</td>
<td style="text-align: center;">72.46</td>
<td style="text-align: center;"><strong>61.98</strong></td>
</tr>
</tbody>
</table>
</div>
</div>
<div class="right ui rail" style="">
<div class="ui sticky results">
<table class="ui small striped table">
<caption>Table 3: Impact of shuffling during pretraining and fine-tuning. While additional data helps, we see that using the shuffling loss (abbreviated as Shuf.) consistently improves model performance. Row 1 corresponds to VLN-BERT <span class="citation" data-cites="majumdar2020vlnbert">[17]</span>. </caption>
<thead>
<tr>
<th style="text-align: left;"></th>
<th style="text-align: center;" colspan="2">BnB Pretraining</th>
<th style="text-align: center;" colspan="2">Speaker Data</th>
<th style="text-align: center;" colspan="2">R2R Fine-tuning</th>
<th style="text-align: center;" colspan="2">Val SR</th>
</tr>
<tr>
<th style="text-align: left;"></th>
<th style="text-align: center;">Mask</th>
<th style="text-align: center;">Shuf.</th>
<th style="text-align: center;">Rank</th>
<th style="text-align: center;">Shuf.</th>
<th style="text-align: center;">Rank</th>
<th style="text-align: center;">Shuf.</th>
<th style="text-align: center;">Seen</th>
<th style="text-align: center;">Unseen</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td style="text-align: left;">1</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">70.20</td>
<td style="text-align: center;">59.26</td>
</tr>
<tr class="even">
<td style="text-align: left;">2</td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">73.24</td>
<td style="text-align: center;">64.21</td>
</tr>
<tr class="odd">
<td style="text-align: left;">3</td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">73.57</td>
<td style="text-align: center;">66.52</td>
</tr>
<tr class="even">
<td style="text-align: left;">4</td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">74.69</td>
<td style="text-align: center;">66.90</td>
</tr>
<tr class="odd">
<td style="text-align: left;">5</td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">-</td>
<td style="text-align: center;">70.21</td>
<td style="text-align: center;">65.52</td>
</tr>
<tr class="even">
<td style="text-align: left;">6</td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">✔️ </td>
<td style="text-align: center;">73.83</td>
<td style="text-align: center;"><strong>68.67</strong></td>
</tr>
</tbody>
</table>
<table class="ui small striped table">
<caption>Table 5: Accuracy of models attempting to pick the correct PI pair from a pool of correct + 10 negatives created by simple corruptions such as replacing or swapping noun phrases and switching directions (left with right). Random performance is <span class="math inline">1/11</span> or 9.1%.</caption>
<tbody>
<thead class="even">
<th style="text-align: center;"></th>
<th style="text-align: center;">Seen</th>
<th style="text-align: center;">Unseen</th>
<th style="text-align: center;">Seen</th>
<th style="text-align: center;">Unseen</th>
<th style="text-align: center;">Seen</th>
<th style="text-align: center;">Unseen</th>
</thead>
<tr class="odd">
<td style="text-align: center;">VLN-BERT</td>
<td style="text-align: center;">60.3</td>
<td style="text-align: center;">58.7</td>
<td style="text-align: center;">53.4</td>
<td style="text-align: center;">52.3</td>
<td style="text-align: center;">46.2</td>
<td style="text-align: center;">45.3</td>
</tr>
<tr class="even">
<td style="text-align: center;">Airbert</td>
<td style="text-align: center;">68.3</td>
<td style="text-align: center;">66.6</td>
<td style="text-align: center;">66.6</td>
<td style="text-align: center;">61.1</td>
<td style="text-align: center;">47.3</td>
<td style="text-align: center;">49.8</td>
</tr>
</tbody>
</table>
<table class="ui small striped table">
<caption>Table 8: Navigation performance on the R2R unseen test set as indicated on the benchmark leaderboard.</caption>
<tbody>
<thead class="odd">
<th style="text-align: left;">Model</th>
<th style="text-align: center;">OSR</th>
<th style="text-align: center;">SR</th>
</thead>
<tr class="odd">
<td style="text-align: left;">Speaker-Follower <span class="citation" data-cites="fried2018speaker">[27]</span></td>
<td style="text-align: center;">96</td>
<td style="text-align: center;">53</td>
</tr>
<tr class="even">
<td style="text-align: left;">PreSS <span class="citation" data-cites="li2019press">[16]</span></td>
<td style="text-align: center;">57</td>
<td style="text-align: center;">53</td>
</tr>
<tr class="odd">
<td style="text-align: left;">PREVALENT <span class="citation" data-cites="hao2020prevalent">[14]</span></td>
<td style="text-align: center;">64</td>
<td style="text-align: center;">59</td>
</tr>
<tr class="even">
<td style="text-align: left;">Self-Monitoring <span class="citation" data-cites="ma2019self">[28]</span></td>
<td style="text-align: center;">97</td>
<td style="text-align: center;">61</td>
</tr>
<tr class="odd">
<td style="text-align: left;">Reinforced CM <span class="citation" data-cites="wang2019reinforced">[31]</span></td>
<td style="text-align: center;">96</td>
<td style="text-align: center;">63</td>
</tr>
<tr class="even">
<td style="text-align: left;">EnvDrop <span class="citation" data-cites="anderson2018r2r">[2]</span></td>
<td style="text-align: center;">99</td>
<td style="text-align: center;">69</td>
</tr>
<tr class="odd">
<td style="text-align: left;">AuxRN <span class="citation" data-cites="zhu2020auxrn">[51]</span></td>
<td style="text-align: center;">81</td>
<td style="text-align: center;">71</td>
</tr>
<tr class="even">
<td style="text-align: left;">VLN-BERT <span class="citation" data-cites="majumdar2020vlnbert">[17]</span></td>
<td style="text-align: center;">99</td>
<td style="text-align: center;">73</td>
</tr>
<tr class="odd">
<td style="text-align: left;">Airbert (ours)</td>
<td style="text-align: center;">99</td>
<td style="text-align: center;">77</td>
</tr>
</tbody>
</table>
</div>
</div>
<p>We first perform ablation studies evaluating alternative ways to pretrain Airbert in Sec. <a href="#sec:xp_pretrain_airbert" data-reference-type="ref" data-reference="sec:xp_pretrain_airbert">5.1</a>. Then, we compare Airbert with state-of-the-art methods on R2R and REVERIE tasks in Sec. <a href="#sec:xp_sota" data-reference-type="ref" data-reference="sec:xp_sota">5.2</a>. Finally, in Sec. <a href="#sec:eval:fsl" data-reference-type="ref" data-reference="sec:eval:fsl">5.3</a>, we evaluate models in a more challenging setup: VLN few-shot learning where an agent is trained on examples taken from one/few houses.</p>
<p><strong>R2R Setup.</strong> We briefly describe the two evaluation datasets used in our work: R2R <span class="citation" data-cites="anderson2018r2r">[2]</span> and REVERIE <span class="citation" data-cites="qi2020reverie">[19]</span>. Most of our experiments are conducted on the R2R dataset <span class="citation" data-cites="anderson2018r2r">[2]</span>, where we adopt the standard splits and metrics defined by the task. We focus on the success rate (SR), the ratio of predicted paths that stop within 3m of the goal. Please refer to <span class="citation" data-cites="anderson2018r2r majumdar2020vlnbert">[2], [17]</span> for a more detailed explanation of the metrics. In particular, since our discriminative model addresses R2R as a path-selection task, we follow the pre-explored environment setting adopted by VLN-BERT <span class="citation" data-cites="majumdar2020vlnbert">[17]</span> and compute metrics on the selected path.</p>
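<p>For reference, the short sketch below shows how a success-rate metric of this form can be computed from stopping and goal positions. It is only an illustrative implementation under our own naming conventions (<code>r2r_success_rate</code>, <code>threshold_m</code>), not code from the benchmark.</p>
<pre><code class="python">
import numpy as np

def r2r_success_rate(stop_positions, goal_positions, threshold_m=3.0):
    """Fraction of episodes whose predicted path stops within threshold_m metres of the goal."""
    dists = np.linalg.norm(np.asarray(stop_positions) - np.asarray(goal_positions), axis=1)
    within_goal = np.less_equal(dists, threshold_m)  # True where the agent stopped close enough
    return float(within_goal.mean())
</code></pre>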
<p><strong>REVERIE Setup.</strong> We also adopt the standard splits and metrics of the REVERIE task <span class="citation" data-cites="qi2020reverie">[19]</span>. Here, the success rate (SR) is the ratio of paths for which the agent stops at a viewpoint where the target object is visible. The Remote Grounding Success rate (RGS) measures the accuracy of localizing the target object from the stopping viewpoint, and RGS per path length (RGSPL) is its path-length-weighted version.</p>
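<p>The following minimal sketch illustrates what a path-length-weighted success metric looks like, assuming RGSPL uses the same shortest-path/travelled-path weighting as the standard SPL metric; the function and argument names are ours.</p>
<pre><code class="python">
def rgspl(successes, path_lengths, shortest_lengths):
    """Path-length-weighted grounding success, assuming an SPL-style weight of
    shortest_path / max(travelled_path, shortest_path) per episode."""
    total = 0.0
    for s, p, sp in zip(successes, path_lengths, shortest_lengths):
        total += s * sp / max(p, sp)  # s is 1 if the target object was correctly grounded
    return total / len(successes)
</code></pre>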
<h2 class="ui header">5.1 Pretraining with BnB</h2>
<p>We perform ablation studies on the impact of various methods for creating path-instruction pairs. We also present ablation studies that highlight the impact of using the shuffling loss during Airbert’s pretraining and fine-tuning stages. Throughout this section, our primary focus is the SR on the unseen validation set, and we compare our results against VLN-BERT <span class="citation" data-cites="majumdar2020vlnbert">[17]</span>, which achieves an SR of 59.26%.</p>
<p><strong>1. Impact of creating path-instruction pairs.</strong> Table <a href="#tab:how" data-reference-type="ref" data-reference="tab:how">2</a> presents the performance of multiple ways of using the BnB dataset after ConCaps pretraining, as illustrated in Fig. <a href="#fig:model" data-reference-type="ref" data-reference="fig:model">3</a>. In row 1, we show that directly using BnB IC pairs, without any strategy to reduce the domain gap, already improves performance over VLN-BERT by 3.2%. Even when ConCaps pretraining is skipped, we achieve 60.54%, outperforming the 59.26% of VLN-BERT. This shows that our BnB dataset is more beneficial to VLN than the generic ConCaps dataset.</p>
<p>Naive concatenation (row 2) does only slightly better than using the IC pairs (row 1), as there are still domain shifts with respect to the fluency of transitions and the lack of visual context. Rows 3-6 show that each method mitigates the domain shift to some extent. Instruction rephrasing (row 3) improves instructions more than instruction generation (row 4), possibly because the generator is unable to use the diverse vocabulary of the BnB captions. Inserting captionless images at random locations (row 6) reduces the domain shift significantly and achieves the highest individual performance. Finally, a combination of instruction rephrasing, image merging and captionless insertion provides an overall 3.8% improvement over concatenation, and a large 7.2% improvement over VLN-BERT.</p>
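<p>As a toy illustration of the concatenation and captionless-insertion ideas, the sketch below builds an instruction-like string from a listing’s captions and inserts captionless images at random positions in the visual path. The templates, function name and arguments are our own placeholders; this is not the actual rephrasing pipeline used in the paper.</p>
<pre><code class="python">
import random

ACTION_TEMPLATES = ["Walk past the {}.", "Go to the {}.", "Enter the {}."]  # illustrative only

def make_path_instruction(captioned, captionless, num_inserts=2):
    """Toy recipe for turning BnB image-caption pairs into a path-instruction pair.

    captioned:   list of (image_feature, caption) pairs from one listing.
    captionless: pool of image features without captions from the same listing.
    """
    # Concatenated instruction: one templated sentence per captioned image.
    instruction = " ".join(random.choice(ACTION_TEMPLATES).format(c) for _, c in captioned)
    path = [img for img, _ in captioned]
    # Captionless insertion: add visual context at random positions; the text is unchanged.
    for img in random.sample(captionless, min(num_inserts, len(captionless))):
        path.insert(random.randrange(len(path) + 1), img)
    return path, instruction
</code></pre>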
<p><strong>2. Shuffling loss applied during pretraining.</strong> Table <a href="#tab:shuffle" data-reference-type="ref" data-reference="tab:shuffle">3</a> demonstrates that shuffling is an effective strategy to train the model to reason about temporal order and to enforce alignment between PI pairs. Rows 2-4 show that shuffling is beneficial both during pretraining with BnB-PI data and during fine-tuning with R2R data, resulting in 2.3% and 0.4% improvements respectively. In combination with the <em>Speaker</em> dataset (paths from seen houses with generated instructions, yielding 178K additional PI pairs <span class="citation" data-cites="tan2019envdrop">[32]</span>), shuffling plays a major role and provides a 3.1% overall improvement (row 5 vs. 6). The resulting 68.67% is also our highest single-model performance on the R2R dataset.</p>
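<p>A minimal sketch of such a shuffling objective is given below, assuming a scoring function that returns a scalar alignment score for a path-instruction pair (as Airbert’s alignment head does); the exact training details in the paper differ, and all names here are ours.</p>
<pre><code class="python">
import random
import torch
import torch.nn.functional as F

def shuffling_loss(score_fn, path, instruction, num_shuffles=2):
    """Contrastive sketch: the correctly ordered path must out-score shuffled copies.

    score_fn(path, instruction) -> 0-dim tensor with an alignment score.
    """
    candidates = [path]
    for _ in range(num_shuffles):
        shuffled = path[:]
        random.shuffle(shuffled)  # breaks temporal order, keeps visual content
        candidates.append(shuffled)
    scores = torch.stack([score_fn(p, instruction) for p in candidates])
    target = torch.tensor(0)  # index 0 is the correctly ordered path
    return F.cross_entropy(scores.unsqueeze(0), target.unsqueeze(0))
</code></pre>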
<p><strong>3. Shuffling loss applied during fine-tuning.</strong> The final stage of model training on R2R involves fine-tuning to rank multiple candidate paths that form the path selection task. We compare various approaches to improve this fine-tuning procedure (results in Table <a href="#tab:dataaug-r2r" data-reference-type="ref" data-reference="tab:dataaug-r2r">4</a>). (1) In row 2, we explore the impact of using additional negative paths. Unsurprisingly, this does not improve performance. (2) Inspired by <span class="citation" data-cites="gupta2020contrastive">[49]</span>, we highlight keywords in the instruction using a part-of-speech tagger <span class="citation" data-cites="joshi2018parser">[50]</span>, and include an extra loss term that encourages the model to pay attention to their similarity scores (row 3). (3) Another alternative suggested by <span class="citation" data-cites="gupta2020contrastive">[49]</span> involves masking keywords in the instruction and using VLP models to suggest replacements, resulting in hard negatives (row 4).</p>
<p>Hard negatives and highlighting keywords yield good performance improvements, about 2.1-2.3%, but at the cost of extra parsers or VLP models. On the other hand, shuffling visual paths to create two additional negatives results in the highest performance improvement (row 5, +2.7% on val unseen) and appears to be a strong strategy to enforce temporal order reasoning, one that requires neither an external parser nor additional VLP models.</p>
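<p>For completeness, the sketch below shows how such shuffled negatives could slot into the path-selection fine-tuning objective, ranking the ground-truth path against beam-search negatives plus shuffled copies of itself. This is again our own simplified formulation, not the paper’s exact implementation.</p>
<pre><code class="python">
import random
import torch
import torch.nn.functional as F

def path_selection_loss(score_fn, positive_path, negative_paths, instruction, num_shuffles=2):
    """Fine-tuning sketch: the ground-truth path should out-score all other candidates."""
    shuffled = []
    for _ in range(num_shuffles):
        copy = positive_path[:]
        random.shuffle(copy)  # extra negatives that only break temporal order
        shuffled.append(copy)
    candidates = [positive_path] + list(negative_paths) + shuffled
    scores = torch.stack([score_fn(p, instruction) for p in candidates]).unsqueeze(0)
    return F.cross_entropy(scores, torch.tensor([0]))  # index 0 is the ground-truth path
</code></pre>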
<p><strong>4. Error analysis.</strong> We study the areas in which Airbert brings major improvements by analyzing scores for aligned PI pairs and for simple corruptions that involve replacing noun phrases (<em>e.g</em>. <em>bedroom</em> by <em>sofa</em>), swapping noun phrases appearing within the instruction, or switching left and right directions (<em>e.g</em>. <em>turn left/right</em> or <em>leftmost/rightmost chair</em>). In particular, for every ground-truth aligned PI pair, we create 10 additional negatives by corrupting the instruction, and report accuracy as the fraction of pairs for which the model assigns the highest score to the correct pair. Table <a href="#tab:analysis" data-reference-type="ref" data-reference="tab:analysis">5</a> shows that Airbert with in-domain training and the shuffling loss achieves large improvements (<span class="math inline">></span> 8%) for corruptions involving replacement or swapping of noun phrases. On the other hand, distinguishing directions continues to be a challenging problem; but here as well Airbert outperforms VLN-BERT by 4.5%.</p>
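<p>The accuracy computation of this error analysis can be summarized as follows; the <code>corrupt_fn</code> hook and all names are hypothetical stand-ins for the corruption procedures described above.</p>
<pre><code class="python">
def corruption_accuracy(score_fn, pairs, corrupt_fn, num_negatives=10):
    """Fraction of aligned PI pairs for which the true instruction out-scores
    num_negatives corrupted variants of itself."""
    correct = 0
    for path, instruction in pairs:
        pos_score = score_fn(path, instruction)
        neg_scores = [score_fn(path, corrupt_fn(instruction)) for _ in range(num_negatives)]
        correct += all(pos_score > neg for neg in neg_scores)
    return correct / len(pairs)
</code></pre>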
<h2 class="ui header">5.2. Comparison against state-of-the-art</h2>
<table class="ui small striped table">
<tbody>
<thead class="odd">
<th style="text-align: center;">Model</th>
<th style="text-align: center;">SR</th>
<th style="text-align: center;">OSR</th>
<th style="text-align: center;">SPL</th>
<th style="text-align: center;">RGS</th>
<th style="text-align: center;">RGSPL</th>
</thead>
<tr class="even">
<td style="text-align: left;">Seq2Seq-SF <span class="citation" data-cites="anderson2018r2r">[2]</span></td>
<td style="text-align: center;">3.99</td>
<td style="text-align: center;">6.88</td>
<td style="text-align: center;">3.09</td>
<td style="text-align: center;">2.00</td>
<td style="text-align: center;">1.58</td>
</tr>
<tr class="odd">
<td style="text-align: left;">RCM <span class="citation" data-cites="wang2019reinforced">[31]</span></td>
<td style="text-align: center;">7.84</td>
<td style="text-align: center;">11.68</td>
<td style="text-align: center;">6.67</td>
<td style="text-align: center;">3.67</td>
<td style="text-align: center;">3.14</td>
</tr>
<tr class="even">
<td style="text-align: left;">SMNA <span class="citation" data-cites="ma2019self">[28]</span></td>
<td style="text-align: center;">5.80</td>
<td style="text-align: center;">8.39</td>
<td style="text-align: center;">4.53</td>
<td style="text-align: center;">3.10</td>
<td style="text-align: center;">2.39</td>
</tr>
<tr class="odd">
<td style="text-align: left;">FAST-MATTN <span class="citation" data-cites="qi2020reverie">[19]</span></td>
<td style="text-align: center;">19.88</td>
<td style="text-align: center;">30.63</td>
<td style="text-align: center;">11.61</td>
<td style="text-align: center;">11.28</td>
<td style="text-align: center;">6.08</td>
</tr>
<tr class="even">
<td style="text-align: left;">Rec (OSCAR) <span class="citation" data-cites="hong2021recurrentvln">[18]</span></td>
<td style="text-align: center;">22.14</td>