# HTTP Async Client Source Code Analysis

2021-11-05
Netty's strength as a high-performance communication framework is that it encapsulates channel connection and communication internally; users just call the wrapped interfaces and get highly concurrent communication. HttpClient works similarly for HTTP requests: internally it uses Java NIO and introduces a connection pool to raise HTTP concurrency. This post explains how the client achieves that internally. The HTTP client comes in synchronous and asynchronous flavors; the following example shows the most basic asynchronous usage:
```java
RequestConfig.Builder requestConfigBuilder = RequestConfig.custom()
        .setConnectTimeout(5000)
        .setSocketTimeout(0)
        .setConnectionRequestTimeout(3000);
HttpAsyncClientBuilder httpClientBuilder = HttpAsyncClientBuilder.create().setDefaultRequestConfig(requestConfigBuilder.build())
        // these values are used when the builder creates its own PoolingNHttpClientConnectionManager;
        // once a custom manager is supplied, they no longer take effect
        .setMaxConnPerRoute(10).setMaxConnTotal(30);
// configure the I/O threads
IOReactorConfig ioReactorConfig = IOReactorConfig.custom()
        .setIoThreadCount(Runtime.getRuntime().availableProcessors())
        .setSoKeepAlive(true)
        .build();
DefaultConnectingIOReactor ioReactor = new DefaultConnectingIOReactor(ioReactorConfig);

ioReactor.setExceptionHandler(new IOReactorExceptionHandler() {
    @Override
    public boolean handle(IOException e) {
        System.out.println("I/O reactor IOException: " + e.getMessage());
        return true;
    }
    @Override
    public boolean handle(RuntimeException e) {
        System.out.println("I/O reactor RuntimeException: " + e.getMessage());
        return true;
    }
});
// configure the channel pool's concurrency limits
PoolingNHttpClientConnectionManager poolingNHttpClientConnectionManager = new PoolingNHttpClientConnectionManager(ioReactor);
poolingNHttpClientConnectionManager.setDefaultMaxPerRoute(5);
poolingNHttpClientConnectionManager.setMaxTotal(80);
// note: httpClientBuilder is configured above but unused below; the client is built from HttpAsyncClients.custom()
httpClientBuilder.setConnectionManager(poolingNHttpClientConnectionManager);

// create the client and start it
CloseableHttpAsyncClient client = HttpAsyncClients.custom()
        .setConnectionManager(poolingNHttpClientConnectionManager)
        .build();
client.start();

final HttpGet request = new HttpGet("http://1.1.1.2:9200/indexName/_search");

// asynchronous request
client.execute(request, new FutureCallback<HttpResponse>() {
    @Override
    public void completed(HttpResponse result) {
        try {
            System.out.println(EntityUtils.toString(result.getEntity()));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    @Override
    public void failed(Exception ex) {
        ex.printStackTrace();
    }
    @Override
    public void cancelled() {
        System.out.println("cancelled");
    }
});
Thread.sleep(10000);
client.close();
```
There is not much to say about the usage itself, so we will follow the data flow and see how the connection pool handles requests internally. One thing to note: if you supply your own PoolingNHttpClientConnectionManager, the per-route and total connection limits set on the client builder (setMaxConnPerRoute/setMaxConnTotal) do not take effect.
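To make that pitfall concrete, here is a minimal sketch (my own illustration, not from the post; imports omitted, matching the post's snippet style) contrasting the two configuration paths. The limits passed to the builder only apply when the builder itself creates the pool manager:

```java
// Path 1: no custom manager; the builder's limits are used to build the internal pool.
CloseableHttpAsyncClient clientA = HttpAsyncClientBuilder.create()
        .setMaxConnPerRoute(10)
        .setMaxConnTotal(30)
        .build();

// Path 2: custom manager; limits must be set on the manager itself.
DefaultConnectingIOReactor reactor = new DefaultConnectingIOReactor(IOReactorConfig.DEFAULT);
PoolingNHttpClientConnectionManager cm = new PoolingNHttpClientConnectionManager(reactor);
cm.setDefaultMaxPerRoute(10);
cm.setMaxTotal(30);
CloseableHttpAsyncClient clientB = HttpAsyncClientBuilder.create()
        .setConnectionManager(cm)   // from here on, cm's limits win
        .setMaxConnPerRoute(10)     // ignored
        .setMaxConnTotal(30)        // ignored
        .build();
```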
# Internal Client Initialization

The Maven dependencies (pom.xml):
```xml
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpcore</artifactId>
    <version>4.4.12</version>
</dependency>
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.10</version>
</dependency>
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpcore-nio</artifactId>
    <version>4.4.12</version>
</dependency>
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpasyncclient</artifactId>
    <version>4.1.4</version>
</dependency>
```
httpclient has since moved on to 5.x; the source code analyzed in this post is from 4.x.
## The InternalHttpAsyncClient client

We need to look at `InternalHttpAsyncClient` and its base class `CloseableHttpAsyncClientBase`. Their important fields are:
```java
// the connection pool manager
private final NHttpClientConnectionManager connmgr;
// MainClientExec: handles request sending and response receiving
private final InternalClientExec exec;
// analogous to Netty's boss thread: establishes channel connections
private final Thread reactorThread;
```
The diagram below shows the important objects created during client initialization:

![](https://kkewwei.github.io/elasticsearch_learning/img/httpasycnclient1.png)

PoolingNHttpClientConnectionManager: as the name suggests, the manager of the connection pool.
CPool: the connection pool itself. It holds the pool's connection state, such as the global idle list `available` and a per-route Pool, both described in detail later.
## Connection-establishing thread + request-processing threads

Internally the client creates two kinds of threads, analogous to Netty's boss and worker threads: one that establishes connection channels (AbstractMultiworkerIOReactor) and ones that send requests (BaseIOReactor). Borrowing Netty's names, this post calls them the boss thread and the worker threads. The boss thread is created in the CloseableHttpAsyncClientBase constructor:
```java
if (threadFactory != null && handler != null) {
    this.reactorThread = threadFactory.newThread(new Runnable() {

        @Override
        public void run() {
            try {
                // e.g. when a thread receives data, control ends up in IOEventDispatch
                final IOEventDispatch ioEventDispatch = new InternalIODispatch(handler);
                // goes to PoolingNHttpClientConnectionManager.execute()
                connmgr.execute(ioEventDispatch);
            } catch (final Exception ex) {
                log.error("I/O reactor terminated abnormally", ex);
            } finally {
                status.set(Status.STOPPED);
            }
        }

    });
} else {
    this.reactorThread = null;
}
The boss thread does its real work in AbstractMultiworkerIOReactor. Pay attention to its selector (created in the AbstractMultiworkerIOReactor constructor): every time a channel needs to be built, an OP_CONNECT interest is registered on this selector. The main loop of AbstractMultiworkerIOReactor looks like this:
```java
public void execute( // eventDispatch = InternalIODispatch
        final IOEventDispatch eventDispatch) throws InterruptedIOException, IOReactorException {
    synchronized (this.statusLock) {
        this.status = IOReactorStatus.ACTIVE;
        // Start I/O dispatchers
        for (int i = 0; i < this.dispatchers.length; i++) {
            final BaseIOReactor dispatcher = new BaseIOReactor(this.selectTimeout, this.interestOpsQueueing);
            dispatcher.setExceptionHandler(exceptionHandler);
            this.dispatchers[i] = dispatcher;
        }
        for (int i = 0; i < this.workerCount; i++) {
            final BaseIOReactor dispatcher = this.dispatchers[i];
            this.workers[i] = new Worker(dispatcher, eventDispatch);
            // the threads created here are named "I/O dispatcher <n>"
            this.threads[i] = this.threadFactory.newThread(this.workers[i]);
        }
    }
    try {
        // start the "I/O dispatcher" threads
        for (int i = 0; i < this.workerCount; i++) {
            if (this.status != IOReactorStatus.ACTIVE) {
                return;
            }
            this.threads[i].start();
        }
        // infinite loop until the reactor shuts down
        for (;;) {
            final int readyCount;
            try {
                // sleeps up to selectTimeout (1s by default)
                readyCount = this.selector.select(this.selectTimeout);
            } catch (final InterruptedIOException ex) {
                throw ex;
            } catch (final IOException ex) {
                throw new IOReactorException("Unexpected selector failure", ex);
            }
            // if there are events to handle, processEvents does the actual connecting
            if (this.status.compareTo(IOReactorStatus.ACTIVE) == 0) {
                // deals exclusively with connections
                processEvents(readyCount);
            }

            // Verify I/O dispatchers
            for (int i = 0; i < this.workerCount; i++) {
                final Worker worker = this.workers[i];
                final Throwable ex = worker.getThrowable();
                if (ex != null) {
                    throw new IOReactorException(
                            "I/O dispatch worker terminated abnormally", ex);
                }
            }
        }
    } finally {
        doShutdown();
        synchronized (this.statusLock) {
            this.status = IOReactorStatus.SHUT_DOWN;
            this.statusLock.notifyAll();
        }
    }
}
```
Concretely, it does the following:
1. Builds n worker threads, each named `I/O dispatcher <n>`; n can be set when building `IOReactorConfig` and defaults to the number of CPU cores.
2. Starts the n worker threads; each one runs in `BaseIOReactor.execute()`.
3. Loops forever on `select(selectTimeout)`, waiting for channel-establishment events and calling `processEvents` to establish channels; the new channel and its request are then handed to a worker thread (into that worker's `newChannels` queue, in round-robin order), which is covered again later. Whenever a new channel needs to be created, `selector.wakeup()` is called automatically.
Now let's see which objects each worker thread builds at initialization:
```java
public AbstractIOReactor(final long selectTimeout, final boolean interestOpsQueueing) throws IOReactorException {
    super();
    // each worker sleeps up to selectTimeout in select(selectTimeout) before re-checking
    this.selectTimeout = selectTimeout;
    // all IOSessionImpl instances managed by this worker
    this.sessions = Collections.synchronizedSet(new HashSet<IOSession>());
    // channels handed over by the boss thread once connected, whose requests await data transfer by this worker
    this.newChannels = new ConcurrentLinkedQueue<ChannelEntry>();
    try {
        // each worker owns a selector that listens for read/write events
        this.selector = Selector.open();
    } catch (final IOException ex) {
        throw new IOReactorException("Failure opening selector", ex);
    }
    this.statusMutex = new Object();
    this.status = IOReactorStatus.INACTIVE;
}
```
What we need to know:
1. Each worker thread also owns its own selector.
2. After the boss establishes a channel, it drops the channel and its request into one worker's newChannels queue; everything after that is done by that worker. A minimal sketch of this handoff follows.
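The following plain-NIO sketch (my own illustration, not the library's code; imports from java.nio and java.util.concurrent omitted, matching the post's snippet style) shows the handoff pattern: the boss enqueues the connected channel and wakes the worker's selector, and the worker drains the queue and registers the channel on its own selector:

```java
// shared between the boss and one worker
Queue<SocketChannel> newChannels = new ConcurrentLinkedQueue<>();
Selector workerSelector = Selector.open();

// boss side, once the channel is connected:
//   newChannels.add(connectedChannel);
//   workerSelector.wakeup();

// worker side, at the top of each loop iteration:
SocketChannel ch;
while ((ch = newChannels.poll()) != null) {
    ch.configureBlocking(false);
    // from now on this worker's selector watches the channel for I/O
    ch.register(workerSelector, SelectionKey.OP_READ);
}
```

With that handoff in mind, let's see what the worker thread keeps busy with in its loop: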
```java
protected void execute() throws InterruptedIOException, IOReactorException {
    this.status = IOReactorStatus.ACTIVE;
    try {
        for (;;) { // another infinite loop
            final int readyCount;
            try {
                // wait for events, at most selectTimeout (1s by default)
                readyCount = this.selector.select(this.selectTimeout);
            } catch (final InterruptedIOException ex) {
                throw ex;
            } catch (final IOException ex) {
                throw new IOReactorException("Unexpected selector failure", ex);
            }
            // Process selected I/O events
            if (readyCount > 0) {
                processEvents(this.selector.selectedKeys());
            }
            // Validate active channels:
            // AbstractIOReactor.timeoutCheck() checks whether the request on each channel timed out;
            // on timeout it logs "<n> milliseconds timeout on connection http-outgoing-..."
            validate(this.selector.keys());
            // Process closed sessions
            processClosedSessions();
            // If active, process new channels
            if (this.status == IOReactorStatus.ACTIVE) {
                processNewChannels();
            }

        }
    } finally {
        hardShutdown();
        synchronized (this.statusMutex) {
            this.statusMutex.notifyAll();
        }
    }
}
```
The worker's main loop does the following:
1. Waits in select(), for at most selectTimeout.
2. When the selector sees events, calls `processEvents()` to handle them; a worker handles only write and read events and ignores everything else.
3. Calls `validate` to check whether the request on each channel has timed out; on timeout it logs something like `milliseconds timeout on connection`, which effectively gives every HTTP request an execution timeout. This timeout is configured via `setSocketTimeout`; set it to 0 if you do not want an HTTP-level timeout (see the sketch below).
4. Calls `processNewChannels` to check whether the boss thread has handed over newly established channels and, if so, processes them (covered later).
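For point 3, a minimal sketch of disabling the per-request timeout, as the opening example did:

```java
// setSocketTimeout(0) means validate() never flags a request as timed out,
// i.e. no HTTP-level execution timeout is enforced
RequestConfig cfg = RequestConfig.custom()
        .setSocketTimeout(0)
        .build();
```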
# The HTTP request-sending phase

## The main thread initiates the request

Let's start from `InternalHttpAsyncClient.execute`, which first builds `new DefaultClientExchangeHandlerImpl().start()`. Pay special attention to `DefaultClientExchangeHandlerImpl`: it holds the content of the current request, and once a channel is leased it is stored in the channel's `http.nio.exchange-handler` attribute.
```java
public DefaultClientExchangeHandlerImpl(
        final Log log,
        final HttpAsyncRequestProducer requestProducer,
        final HttpAsyncResponseConsumer<T> responseConsumer, // buffers the response
        final HttpClientContext localContext,
        final BasicFuture<T> resultFuture,
        final NHttpClientConnectionManager connmgr,
        final ConnectionReuseStrategy connReuseStrategy,
        final ConnectionKeepAliveStrategy keepaliveStrategy,
        final InternalClientExec exec) {
    // 1. the base class generates an id for every request
    // 2. note localContext: it can carry many request-private attributes
    super(log, localContext, connmgr, connReuseStrategy, keepaliveStrategy);
    // produces the request
    this.requestProducer = requestProducer;
    // where the response is stored
    this.responseConsumer = responseConsumer;
    // completes the user-facing future
    this.resultFuture = resultFuture;
    this.exec = exec;
    // every exchange carries an InternalState
    this.state = new InternalState(getId(), requestProducer, responseConsumer, localContext);
}
```
This eventually calls `AbstractClientExchangeHandler.requestConnection()` -> `PoolingNHttpClientConnectionManager.requestConnection()` -> `AbstractNIOConnPool.lease()`:
```java
public Future<E> lease(
        final T route, final Object state,
        final long connectTimeout, final long leaseTimeout, final TimeUnit timeUnit,
        final FutureCallback<E> callback) {
    final BasicFuture<E> future = new BasicFuture<E>(callback);
    final LeaseRequest<T, C, E> leaseRequest = new LeaseRequest<T, C, E>(route, state,
            connectTimeout >= 0 ? timeUnit.toMillis(connectTimeout) : -1,
            leaseTimeout > 0 ? timeUnit.toMillis(leaseTimeout) : 0, // connectionRequestTimeout
            future);
    // only one thread may lease at a time; pending placeholders guard concurrency
    this.lock.lock();
    try {
        final boolean completed = processPendingRequest(leaseRequest);
        if (!leaseRequest.isDone() && !completed) {
            this.leasingRequests.add(leaseRequest);
        }
        if (leaseRequest.isDone()) {
            this.completedRequests.add(leaseRequest);
        }
    } finally {
        this.lock.unlock();
    }
    fireCallbacks();
    ......
}
```
The main purpose of this function is to lease a connection from the pool:
1. First it calls `this.lock.lock()` to lock the whole connection pool. A note here: in practice, under fairly high concurrency this lock shows severe contention, with blocking of 1-3 s; in httpclient 5.x the pool-wide lock has been split down to per-route locks, which greatly reduces lock wait time.
2. It calls `processPendingRequest` to decide whether there is an idle reusable channel, whether a new connection may be created, or whether the request must pend:
- If it returns false and leaseRequest is not done, the pool is full; the request is parked in leasingRequests to be retried later (a sketch of how this surfaces to the caller follows this list).
- If it returns true and leaseRequest is done, a reusable connection channel was leased; the request goes into completedRequests and is handed to a worker thread when `fireCallbacks()` runs.
- If it returns true and leaseRequest is not done, the route has not reached its connection limit; inside `processPendingRequest` the request was already put into `DefaultConnectingIOReactor.requestQueue`, where it waits for the boss thread to create a new channel.
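For the first case, here is a hedged sketch (my own illustration, reusing the `client` from the opening example) of what the caller observes: if the lease stays pending past setConnectionRequestTimeout, the future fails with the `java.util.concurrent.TimeoutException` thrown in processPendingRequest below:

```java
client.execute(new HttpGet("http://1.1.1.2:9200/indexName/_search"),
        new FutureCallback<HttpResponse>() {
            @Override
            public void completed(final HttpResponse result) {
                // a pooled or freshly created connection was obtained in time
            }
            @Override
            public void failed(final Exception ex) {
                if (ex instanceof TimeoutException) {
                    // "Connection lease request time out": the pool stayed full
                    // past the connectionRequestTimeout window
                }
            }
            @Override
            public void cancelled() {
            }
        });
```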
Before going further, here is the internal structure of the connection pool:

![](https://kkewwei.github.io/elasticsearch_learning/img/httpasycnclient2.png)
- leasingRequests: requests whose route has already reached its connection limit.
- available: when a request completes, its channel is released into available, where later requests can reuse it directly.
- pending: requests that have obtained the right to create their own SocketChannel. Only once the channel object `ManagedNHttpClientConnectionImpl` is built (at which point the SocketChannel exists) does the request move from pending to leased.
- leased: requests that either got a reusable channel directly from available, or moved over from pending after their ManagedNHttpClientConnectionImpl was created; they stay here until the request completes and the channel is released back into available.
- completedRequests: requests that obtained a ManagedNHttpClientConnectionImpl directly from the pool and are waiting to be handed to a worker thread.
- service1: a remote server; each service1 is one IP, i.e. one route.

The internal request flow looks like this:

![](https://kkewwei.github.io/elasticsearch_learning/img/httpasycnclient3.png)

The containment relationships between the channel objects involved:

![](https://kkewwei.github.io/elasticsearch_learning/img/httpasycnclient4.png)
Let's look in detail at how `processPendingRequest` leases a channel from the pool.
```java
private boolean processPendingRequest(final LeaseRequest<T, C, E> request) {
    final T route = request.getRoute();
    final Object state = request.getState();
    final long deadline = request.getDeadline();
    final long now = System.currentTimeMillis();
    // has the lease request already timed out waiting for the lock?
    if (now > deadline) {
        request.failed(new TimeoutException("Connection lease request time out"));
        return false;
    }
    // the connection pool for this route
    final RouteSpecificPool<T, C, E> pool = getPool(route);
    E entry;
    for (;;) {
        // try the free list first
        entry = pool.getFree(state);
        // no idle entry
        if (entry == null) {
            break;
        }
        // got one from the free list
        if (entry.isClosed() || entry.isExpired(System.currentTimeMillis())) {
            entry.close();
            this.available.remove(entry);
            // release it outright
            pool.free(entry, false);
        } else {
            break;
        }
    }
    // leased a CPoolEntry from the idle queue
    if (entry != null) {
        this.available.remove(entry);
        // move it to the global leased set
        this.leased.add(entry);
        // mark the request completed
        request.completed(entry);
        // no-ops by default
        onReuse(entry);
        onLease(entry);
        // return directly
        return true;
    }
    // nothing idle and reusable
    // New connection is needed
    final int maxPerRoute = getMax(route);
    // Shrink the pool prior to allocating a new connection
    final int excess = Math.max(0, pool.getAllocatedCount() + 1 - maxPerRoute);
    // check whether the entries already created exceed this route's limit; if so, close some
    if (excess > 0) {
        // over the limit: start closing entries from the idle queue
        for (int i = 0; i < excess; i++) {
            // take one from the idle list
            final E lastUsed = pool.getLastUsed();
            if (lastUsed == null) {
                break;
            }
            lastUsed.close();
            this.available.remove(lastUsed);
            pool.remove(lastUsed); // remove it from this route's pool
        }
    }
    // this route has not yet reached its per-route limit
    if (pool.getAllocatedCount() < maxPerRoute) {
        // overall pool usage
        final int totalUsed = this.pending.size() + this.leased.size();
        // remaining capacity against the pool-wide maximum
        final int freeCapacity = Math.max(this.maxTotal - totalUsed, 0);
        // pool is full
        if (freeCapacity == 0) {
            return false;
        }
        // not full yet; check whether the global idle list exceeds what is allowed
        final int totalAvailable = this.available.size();
        // would the idle entries plus this new connection exceed the remaining capacity?
        if (totalAvailable > freeCapacity - 1) {
            // if so, close one idle entry
            if (!this.available.isEmpty()) {
                final E lastUsed = this.available.removeLast(); // CPoolEntry
                lastUsed.close();
                final RouteSpecificPool<T, C, E> otherpool = getPool(lastUsed.getRoute());
                otherpool.remove(lastUsed);
            }
        }

        final SocketAddress localAddress;
        final SocketAddress remoteAddress;
        try {
            // resolves the host name; e.g. qa1.l1c.data.hehe.com maps to 5 IPs and
            // admin.daxe1.l1c.data.hehe.com maps to one VIP, but only the first is taken
            remoteAddress = this.addressResolver.resolveRemoteAddress(route);
            localAddress = this.addressResolver.resolveLocalAddress(route);
        } catch (final IOException ex) {
            request.failed(ex);
            return false;
        }
        // puts the request into the request queue and wakes up the boss selector
        final SessionRequest sessionRequest = this.ioReactor.connect(
                remoteAddress, localAddress, route, this.sessionRequestCallback);
        request.attachSessionRequest(sessionRequest);
        final long connectTimeout = request.getConnectTimeout();
        if (connectTimeout >= 0) {
            sessionRequest.setConnectTimeout(connectTimeout < Integer.MAX_VALUE ? (int) connectTimeout : Integer.MAX_VALUE);
        }
        // add to the pool-wide pending set
        this.pending.add(sessionRequest);
        // has obtained the right to connect, but the connection is not established yet
        pool.addPending(sessionRequest, request.getFuture());
        return true;
    }
    return false;
}
```
Leasing a connection from the pool does the following:
1. Checks whether acquiring the lock has already timed out; this timeout is the one set via `setConnectionRequestTimeout(3000)`.
2. Gets this route's pool, RouteSpecificPool, and checks for an idle reusable connection; if one is found, its CPoolEntry is returned.
3. Checks whether this route's connections would exceed the limit; if so, takes entries from available and closes their channels.
4. If the limit is not reached, calls `ioReactor.connect()` to put the request into DefaultConnectingIOReactor.requestQueue and wakes the main selector, so the boss thread can create a new channel.
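A tiny worked example of the shrink logic in point 3, with assumed numbers: suppose maxPerRoute is 5 and the route already has five allocated entries when one more connection is requested:

```java
int allocated = 5, maxPerRoute = 5;
int excess = Math.max(0, allocated + 1 - maxPerRoute);  // = 1
// one least-recently-used idle entry on this route is closed
// before a new channel may be opened
```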
## The boss thread creates new connection channels

As mentioned earlier, the boss thread wakes up from selector.select(). After waking, it enters `DefaultConnectingIOReactor.processEvents` to check whether any requests need a connection established.
```java
protected void processEvents(final int readyCount) throws IOReactorException {
    // create new channels
    processSessionRequests();
    if (readyCount > 0) {
        final Set<SelectionKey> selectedKeys = this.selector.selectedKeys();
        for (final SelectionKey key : selectedKeys) {
            // a connect event has fired
            processEvent(key);
        }
        selectedKeys.clear();
    }
    // check whether select has timed out (1s by default)
    final long currentTime = System.currentTimeMillis();
    if ((currentTime - this.lastTimeoutCheck) >= this.selectTimeout) {
        this.lastTimeoutCheck = currentTime;
        final Set<SelectionKey> keys = this.selector.keys();
        processTimeouts(keys);
    }
}
```
Each boss iteration mainly does two things:
1. Calls `processSessionRequests` to create new channels.
2. Calls `processEvent()` to handle connect events that have fired.
Note that `DefaultConnectingIOReactor.processSessionRequests` only initiates channel creation and does not wait for the channel to become ready; `processEvent` is what watches for the channel becoming established. Let's continue with what channel creation involves:
```java
private void processSessionRequests() throws IOReactorException {
    SessionRequestImpl request;
    // requests entitled to a new channel, but with no channel yet
    while ((request = this.requestQueue.poll()) != null) {
        // already completed?
        if (request.isCompleted()) {
            continue;
        }
        final SocketChannel socketChannel; // SocketChannelImpl
        try { // open a socket
            socketChannel = SocketChannel.open();
        } catch (final IOException ex) {
            request.failed(ex);
            return;
        }
        try {
            validateAddress(request.getLocalAddress());
            validateAddress(request.getRemoteAddress());
            // non-blocking mode
            socketChannel.configureBlocking(false);
            // set socket options: address reuse, connect timeout, kernel write buffer, ...
            prepareSocket(socketChannel.socket());

            if (request.getLocalAddress() != null) { // null here
                final Socket sock = socketChannel.socket();
                sock.setReuseAddress(this.config.isSoReuseAddress());
                sock.bind(request.getLocalAddress());
            }

            final SocketAddress targetAddress = request.getRemoteAddress();
            // Run this under a doPrivileged to support lib users that run under a SecurityManager;
            // this allows granting connect permissions only to this library
            // did the connect complete immediately?
            final boolean connected;
            try {
                connected = AccessController.doPrivileged(
                        new PrivilegedExceptionAction<Boolean>() {
                            @Override
                            public Boolean run() throws IOException {
                                // connect to the remote node
                                return socketChannel.connect(targetAddress);
                            }
                        });
            } catch (final PrivilegedActionException e) { // (abridged in the original excerpt)
                throw (IOException) e.getCause();
            }
            // if the connection completed immediately
            if (connected) {
                final ChannelEntry entry = new ChannelEntry(socketChannel, request);
                // hand it straight to a worker; the boss is done with it
                addChannel(entry);
                continue;
            }
        } catch (final IOException ex) { // (abridged in the original excerpt)
            request.failed(ex);
            continue;
        }
        // not connected yet: register with the selector and wait for the connect event,
        // which processEvent will handle
        final SessionRequestHandle requestHandle = new SessionRequestHandle(request);
        try {
            // register an OP_CONNECT interest for this channel
            final SelectionKey key = socketChannel.register(this.selector, SelectionKey.OP_CONNECT, requestHandle);
            request.setKey(key);
        } catch (final ClosedChannelException ex) { // (abridged in the original excerpt)
            request.failed(ex);
        }
    }
}
```
<p>The channel-creation flow is fairly clear:<br>1. Loop over <code>DefaultConnectingIOReactor.requestQueue</code>, taking the channel-creation requests (as mentioned earlier, the main thread puts them into this queue).<br>2. After creating the SocketChannelImpl, initiate the non-blocking connect:</p>
<ul>
<li>If the connect completes synchronously, the resulting ChannelEntry(socketChannel, request) is assigned in turn to a worker thread (into that worker's <code>newChannels</code> queue).</li>
<li>If the connect has not yet completed, a SelectionKey.OP_CONNECT interest is registered with the boss's selector, waiting for the connect event to fire. (Only a ServerSocketChannel can register SelectionKey.OP_ACCEPT; a SocketChannel can only register SelectionKey.OP_CONNECT.) A minimal NIO sketch of this pattern follows the list.</li>
</ul>
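<p>To make the pattern concrete, here is a minimal, self-contained java.nio sketch of the same "connect now or wait for OP_CONNECT" flow. This is our own illustration, not the HttpCore source; the host and port are placeholders:</p>
<figure class="highlight plain"><table><tr><td class="code"><pre>import java.io.IOException;
import java.net.InetSocketAddress;
import java.nio.channels.SelectionKey;
import java.nio.channels.Selector;
import java.nio.channels.SocketChannel;

public class ConnectSketch {
    public static void main(String[] args) throws IOException {
        Selector selector = Selector.open();
        SocketChannel channel = SocketChannel.open();
        channel.configureBlocking(false);
        // connect() returns true only when the connection completes immediately
        boolean connected = channel.connect(new InetSocketAddress("example.com", 80));
        if (connected) {
            // hand the channel straight to a worker, as processSessionRequests does
            System.out.println("connected immediately");
        } else {
            // otherwise register OP_CONNECT and wait for the selector to report it
            channel.register(selector, SelectionKey.OP_CONNECT);
            selector.select();
            for (SelectionKey key : selector.selectedKeys()) {
                if (key.isConnectable()) {
                    // finishConnect() must be called to complete the handshake
                    ((SocketChannel) key.channel()).finishConnect();
                }
            }
            selector.selectedKeys().clear();
        }
        channel.close();
        selector.close();
    }
}</pre></td></tr></table></figure>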
<p>Let's look at how <code>processEvent</code> handles the OP_CONNECT event.</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br></pre></td><td class="code"><pre><span class="line">private void processEvent(final SelectionKey key) {</span><br><span class="line"> try {</span><br><span class="line"> // 该key是否是connect属性</span><br><span class="line"> if (key.isConnectable()) {</span><br><span class="line"> final SocketChannel channel = (SocketChannel) key.channel();</span><br><span class="line"> // Get request handle</span><br><span class="line"> final SessionRequestHandle requestHandle = (SessionRequestHandle) key.attachment();</span><br><span class="line"> final SessionRequestImpl sessionRequest = requestHandle.getSessionRequest();</span><br><span class="line"></span><br><span class="line"> // Finish connection process</span><br><span class="line"> try {</span><br><span class="line"> // 非阻塞模式下,确认是否连接好,若未连接好,直接返回false,方法必不可少(置位管道状态)</span><br><span class="line"> channel.finishConnect();</span><br><span class="line"> } catch (final IOException ex) {</span><br><span class="line"> sessionRequest.failed(ex);</span><br><span class="line"> }</span><br><span class="line"> key.cancel();</span><br><span class="line"> key.attach(null);</span><br><span class="line"> if (!sessionRequest.isCompleted()) {</span><br><span class="line"> addChannel(new ChannelEntry(channel, sessionRequest));</span><br><span class="line"> } else {</span><br><span class="line"> try {</span><br><span class="line"> channel.close();</span><br><span class="line"> } catch (final IOException ignore) {</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>This should be clear at a glance: the boss thread handles only connect events and discards everything else. Once it sees the connection is established, it builds new ChannelEntry(channel, sessionRequest) and assigns it in turn to a worker thread (into that worker's <code>newChannels</code> queue).</p>
<h2 id="worker线程发送请求"><a href="#worker线程发送请求" class="headerlink" title="worker线程发送请求"></a>worker线程发送请求</h2><p>接下来就看worker线程接到请求后如何处理了。前面worker也提到了,worker死循环会做如下三件事情(参考<code>AbstractIOReactor.execute</code>函数);<br>1.调用<code>processEvents</code>检查新的write、read事件。<br>2.调用<code>validate</code>判断是否有查询超时,超时参数通过setSocketTimeout参数设置<br>3.调用<code>processNewChannels</code>处理boss线程传递的新创建的管道及请求。<br>我们先看下<code>AbstractIOReactor.processNewChannels()</code>如何处理新创建的管道及请求的。</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span class="line">61</span><br></pre></td><td class="code"><pre><span class="line">private void processNewChannels() throws IOReactorException {</span><br><span class="line"> ChannelEntry entry;</span><br><span class="line"> // 轮循每个新产生的请求及对应的管道</span><br><span class="line"> while ((entry = this.newChannels.poll()) != null) {</span><br><span class="line"></span><br><span class="line"> final SocketChannel channel;</span><br><span class="line"> final SelectionKey key;</span><br><span class="line"> try {</span><br><span class="line"> channel = entry.getChannel();</span><br><span class="line"> channel.configureBlocking(false);</span><br><span class="line"> // SelectionKeyImpl,都注册read事件</span><br><span class="line"> key = channel.register(this.selector, SelectionKey.OP_READ); </span><br><span class="line"> }</span><br><span class="line"> final SessionClosedCallback sessionClosedCallback = new SessionClosedCallback() {</span><br><span class="line"> @Override</span><br><span class="line"> public void sessionClosed(final IOSession session) {</span><br><span class="line"> queueClosedSession(session);</span><br><span class="line"> }</span><br><span class="line"> };</span><br><span class="line"> InterestOpsCallback interestOpsCallback = null;</span><br><span class="line"> final IOSession session;</span><br><span class="line"> try {</span><br><span class="line"> // IOSessionImpl与key是绑定的,因为key是重复利用的,所以IOSessionImpl也是重复利用的</span><br><span class="line"> session = new IOSessionImpl(key, interestOpsCallback, sessionClosedCallback);</span><br><span class="line"> int 
timeout = 0;</span><br><span class="line"> try {</span><br><span class="line"> timeout = channel.socket().getSoTimeout();</span><br><span class="line"> } catch (final IOException ex) {</span><br><span class="line"> // Very unlikely to happen and is not fatal</span><br><span class="line"> // as the protocol layer is expected to overwrite</span><br><span class="line"> // this value anyways</span><br><span class="line"> }</span><br><span class="line"> // 设置http.session.attachment</span><br><span class="line"> session.setAttribute(IOSession.ATTACHMENT_KEY, entry.getAttachment()); </span><br><span class="line"> // 设置超时</span><br><span class="line"> session.setSocketTimeout(timeout);</span><br><span class="line"> } </span><br><span class="line"> try {</span><br><span class="line"> // 一个新的上下文请求</span><br><span class="line"> this.sessions.add(session);</span><br><span class="line"> // 将这个IOSessionImpl放入SelectionKeyImpl中 </span><br><span class="line"> key.attach(session);</span><br><span class="line"> final SessionRequestImpl sessionRequest = entry.getSessionRequest();</span><br><span class="line"> if (sessionRequest != null) {</span><br><span class="line"> if (!sessionRequest.isTerminated()) {</span><br><span class="line"> //1.产生了connection,2.往AbstractNIOConnPool.leased放入CPoolEntry.3.设置可写事件</span><br><span class="line"> sessionRequest.completed(session);</span><br><span class="line"> }</span><br><span class="line"> if (!sessionRequest.isTerminated()) {</span><br><span class="line"> // 进来设置write事件了</span><br><span class="line"> sessionCreated(key, session);</span><br><span class="line"> }</span><br><span class="line"> if (sessionRequest.isTerminated()) {</span><br><span class="line"> throw new CancelledKeyException();</span><br><span class="line"> }</span><br><span class="line"> } else {</span><br><span class="line"> sessionCreated(key, session);</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>This function does the following:<br>1. Registers the <code>SelectionKey.OP_READ</code> event on the channel.<br>2. Creates the IOSessionImpl object; note that its lifecycle is bound to the SocketChannelImpl.<br>3. Calls <code>sessionRequest.completed</code>, which:</p>
<ul>
<li>Creates the <code>ManagedNHttpClientConnectionImpl</code> channel.</li>
<li>Builds the CPoolEntry.</li>
<li>Adds an <code>http.nio.exchange-handler</code> entry to the channel's <code>IOSessionImpl.attributes</code>, binding the request content (<code>DefaultClientExchangeHandlerImpl</code>) to the channel.</li>
<li>Registers interest in the <code>SelectionKey.OP_WRITE</code> event on the channel.</li>
</ul>
<blockquote>
<p>Note:<br>1. Calling <code>AbstractClientExchangeHandler.connectionAllocated</code> means the ManagedNHttpClientConnectionImpl channel is ready and only waits for a worker to send the request. It is invoked from two places: 1. the main thread obtains an available channel from the idle list; 2. a worker thread, after receiving the SocketChannelImpl created by the boss thread, creates the ManagedNHttpClientConnectionImpl channel.<br>2. Before any data is sent, each channel is bound to its request via the <code>http.nio.exchange-handler</code> attribute. Each channel represents one unit of connection concurrency and carries only one request at a time; only after the previous request finishes is the channel allocated to the next one.<br>3. Why not send the request directly instead of registering the SelectionKey.OP_WRITE event? A direct write can fail when the kernel write buffer is full; by registering, the selector only signals writability once there is room in the buffer.<br>4. At this point the channel has registered both the <code>SelectionKey.OP_READ</code> and <code>SelectionKey.OP_WRITE</code> events.</p>
</blockquote>
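<p>Point 3 is worth a small illustration. The sketch below shows the standard NIO idiom the reactor relies on: attempt the write, and if the kernel buffer cannot take everything, register OP_WRITE and let the selector resume the flush. This is our own minimal example, not the HttpCore code:</p>
<figure class="highlight plain"><table><tr><td class="code"><pre>import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.SelectionKey;
import java.nio.channels.SocketChannel;

public final class WriteInterestSketch {
    // Try to flush; if the kernel send buffer is full, fall back to OP_WRITE.
    static void flushOrAwaitWritable(SocketChannel channel, SelectionKey key,
                                     ByteBuffer buffer) throws IOException {
        channel.write(buffer);                 // may write 0 bytes when the buffer is full
        if (buffer.hasRemaining()) {
            // not fully sent: let the selector wake us when the socket is writable
            key.interestOps(key.interestOps() | SelectionKey.OP_WRITE);
        } else {
            // fully flushed: clear OP_WRITE so the selector does not spin
            key.interestOps(key.interestOps() &amp; ~SelectionKey.OP_WRITE);
        }
    }
}</pre></td></tr></table></figure>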
<p>Next, let's look at how <code>AbstractIOReactor.processEvents</code> dispatches events.</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br></pre></td><td class="code"><pre><span class="line">private void processEvents(final Set<SelectionKey> selectedKeys) {</span><br><span class="line"> for (final SelectionKey key : selectedKeys) {</span><br><span class="line"> processEvent(key);</span><br><span class="line"> }</span><br><span class="line"> selectedKeys.clear();</span><br><span class="line">}</span><br><span class="line">protected void processEvent(final SelectionKey key) {</span><br><span class="line"> // 直接通过IOsessionImpl获取元数据,复用时,</span><br><span class="line"> final IOSessionImpl session = (IOSessionImpl) key.attachment();</span><br><span class="line"> try {</span><br><span class="line"> if (key.isAcceptable()) {// accept事件</span><br><span class="line"> acceptable(key); // 啥都不干</span><br><span class="line"> }</span><br><span class="line"> if (key.isConnectable()) { // connect事件</span><br><span class="line"> connectable(key); // 啥都不干</span><br><span class="line"> }</span><br><span class="line"> if (key.isReadable()) { // 读事件</span><br><span class="line"> session.resetLastRead();</span><br><span class="line"> readable(key);</span><br><span class="line"> }</span><br><span class="line"> if (key.isWritable()) {// 里面注册了可写事件</span><br><span class="line"> session.resetLastWrite();</span><br><span class="line"> writable(key); // 真正写数据</span><br><span class="line"> }</span><br><span class="line"> } catch (final CancelledKeyException ex) {</span><br><span class="line"> queueClosedSession(session);</span><br><span class="line"> key.attach(null);</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>What matters here are the <code>write</code> and <code>read</code> events; <code>accept</code> and <code>connect</code> are simply discarded. Handling of the <code>read</code> response is covered in detail in the next section. Let's continue with what happens after a <code>write</code> event fires: sending data ends up in <code>DefaultNHttpClientConnection.produceOutput</code>:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br></pre></td><td class="code"><pre><span class="line">public void produceOutput(final NHttpClientEventHandler handler) {// HttpAsyncRequestExecutor</span><br><span class="line"> try {</span><br><span class="line"> if (this.status == ACTIVE) {</span><br><span class="line"> if (this.contentEncoder == null && !this.outbuf.hasData()) {</span><br><span class="line"> handler.requestReady(this);</span><br><span class="line"> }</span><br><span class="line"> // 编码请求,默认使用LengthDelimitedEncoder进行编码</span><br><span class="line"> if (this.contentEncoder != null) {</span><br><span class="line"> handler.outputReady(this, this.contentEncoder);</span><br><span class="line"> if (this.contentEncoder.isCompleted()) {</span><br><span class="line"> resetOutput();</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> if (this.outbuf.hasData()) {</span><br><span class="line"> // 真正向管道中刷数据了</span><br><span class="line"> final int bytesWritten = this.outbuf.flush(this.session.channel());</span><br><span class="line"> if (bytesWritten > 0) {</span><br><span class="line"> this.outTransportMetrics.incrementBytesTransferred(bytesWritten);</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> if (!this.outbuf.hasData()) {// 若没有数据了</span><br><span class="line"> if (this.status == CLOSING) {</span><br><span class="line"> this.session.close();</span><br><span class="line"> this.status = CLOSED;</span><br><span class="line"> resetOutput();</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> } finally {</span><br><span class="line"> // Finally set the buffered output flag</span><br><span class="line"> this.hasBufferedOutput = this.outbuf.hasData();</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>It does two main things:<br>1. Encodes the body with <code>LengthDelimitedEncoder</code>.<br>2. Calls <code>this.outbuf.flush()</code> to actually send the encoded content out over the SocketChannel.</p>
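<p>As a rough illustration of what a length-delimited encoder has to guarantee, here is a minimal sketch that writes at most the declared Content-Length bytes and reports completion once they are all out. It is a simplification under our own assumptions, not the actual LengthDelimitedEncoder source:</p>
<figure class="highlight plain"><table><tr><td class="code"><pre>import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.WritableByteChannel;

public class LengthDelimitedEncoderSketch {
    private final long contentLength;
    private long written;

    public LengthDelimitedEncoderSketch(long contentLength) {
        this.contentLength = contentLength;
    }

    public int write(WritableByteChannel channel, ByteBuffer src) throws IOException {
        long remaining = contentLength - written;
        if (remaining <= 0) {
            return 0; // body already complete
        }
        if (src.remaining() > remaining) {
            // never send more than the declared body length
            ByteBuffer slice = src.duplicate();
            slice.limit(slice.position() + (int) remaining);
            int n = channel.write(slice);
            src.position(src.position() + n);
            written += n;
            return n;
        }
        int n = channel.write(src);
        written += n;
        return n;
    }

    public boolean isCompleted() {
        return written >= contentLength;
    }
}</pre></td></tr></table></figure>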
<h2 id="http响应阶段"><a href="#http响应阶段" class="headerlink" title="http响应阶段"></a>http响应阶段</h2><p>http响应阶段在<code>AbstractIOReactor.processEvents</code>的<code>key.isReadable()</code>处接受响应,会进入到<code>BaseIOReactor.readable()</code>中。</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br></pre></td><td class="code"><pre><span class="line">protected void readable(final SelectionKey key) {</span><br><span class="line"> //获取这个key绑定的IOSessionImpl(在key与管道,IOSessionImpl都是绑定一起的)</span><br><span class="line"> final IOSession session = getSession(key); </span><br><span class="line"> try {</span><br><span class="line"> // Try to gently feed more data to the event dispatcher</span><br><span class="line"> // if the session input buffer has not been fully exhausted</span><br><span class="line"> // (the choice of 5 iterations is purely arbitrary)</span><br><span class="line"> for (int i = 0; i < 5; i++) {</span><br><span class="line"> // 实现类是InternalIODispatch</span><br><span class="line"> this.eventDispatch.inputReady(session);</span><br><span class="line"> if (!session.hasBufferedInput()</span><br><span class="line"> || (session.getEventMask() & SelectionKey.OP_READ) == 0) {</span><br><span class="line"> break;</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> if (session.hasBufferedInput()) {</span><br><span class="line"> this.bufferingSessions.add(session);</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>When reading from the channel, it loops up to 5 times until the data is fully consumed (usually a single call to <code>InternalIODispatch.inputReady</code> reads everything). The read path lands in <code>DefaultNHttpClientConnection.consumeInput</code>:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br></pre></td><td class="code"><pre><span class="line">public void consumeInput(final NHttpClientEventHandler handler) {// HttpAsyncRequestExecutor</span><br><span class="line"> try {</span><br><span class="line"> if (this.response == null) {</span><br><span class="line"> int bytesRead;</span><br><span class="line"> // 循环读取,直到读取完成</span><br><span class="line"> do {</span><br><span class="line"> // 首先读8k</span><br><span class="line"> bytesRead = this.responseParser.fillBuffer(this.session.channel());</span><br><span class="line"> if (bytesRead > 0) {</span><br><span class="line"> this.inTransportMetrics.incrementBytesTransferred(bytesRead);</span><br><span class="line"> }</span><br><span class="line"> //BasicHttpResponse,解析了如何读取http的字节流</span><br><span class="line"> this.response = this.responseParser.parse();</span><br><span class="line"> } while (bytesRead > 0 && this.response == null);</span><br><span class="line"> if (this.response != null) {</span><br><span class="line"> if (this.response.getStatusLine().getStatusCode() >= 200) {</span><br><span class="line"> // 这里才会产生一个createContentDecoder</span><br><span class="line"> final HttpEntity entity = prepareDecoder(this.response);</span><br><span class="line"> this.response.setEntity(entity);</span><br><span class="line"> this.connMetrics.incrementResponseCount();</span><br><span class="line"> }</span><br><span class="line"> this.hasBufferedInput = this.inbuf.hasData();</span><br><span class="line"> onResponseReceived(this.response);// 没用</span><br><span class="line"> handler.responseReceived(this);//从管道中读取完数据后,handler=HttpAsyncRequestExecutor</span><br><span class="line"> if (this.contentDecoder == null) {</span><br><span class="line"> resetInput();</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> if (bytesRead == -1 && !this.inbuf.hasData()) {</span><br><span class="line"> handler.endOfInput(this);</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> if 
(this.contentDecoder != null && (this.session.getEventMask() & SelectionKey.OP_READ) > 0) {</span><br><span class="line"> // 1.读取body,2.会存在释放管道的行为.3.响应用户</span><br><span class="line"> handler.inputReady(this, this.contentDecoder);</span><br><span class="line"> if (this.contentDecoder.isCompleted()) {</span><br><span class="line"> // Response entity received</span><br><span class="line"> // Ready to receive a new response</span><br><span class="line"> resetInput();</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> } finally {</span><br><span class="line"> // Finally set buffered input flag</span><br><span class="line"> this.hasBufferedInput = this.inbuf.hasData();</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>The read path does the following:<br>1. First calls <code>responseParser.fillBuffer()</code> to pull up to 8 KB of bytes out of the channel, then calls <code>AbstractMessageParser.parse()</code> to parse the HTTP header section.<br>2. Calls <code>NHttpConnectionBase.prepareDecoder()</code>, which reads the content length from the content-length header and creates the <code>LengthDelimitedDecoder</code> used to decode the body; the 8 KB buffer is handed to the LengthDelimitedDecoder as well.<br>3. Calls <code>HttpAsyncRequestExecutor.responseReceived</code>, which sizes the receive buffer (byte[]) from content-length, allocated by default through HeapByteBufferAllocator.INSTANCE.<br>4. Calls <code>HttpAsyncRequestExecutor.inputReady()</code> to assemble the whole content (in practice this enters <code>SimpleInputBuffer.consumeContent()</code>, which reads from the channel using the <code>LengthDelimitedDecoder</code>); it then calls <code>HttpAsyncRequestExecutor.processResponse()</code> to release the channel and respond to the user.</p>
<p>Let's look at how <code>AbstractMessageParser.parse()</code> parses the HTTP headers:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br></pre></td><td class="code"><pre><span class="line">public T parse() throws IOException, HttpException {</span><br><span class="line"> while (this.state != COMPLETED) {</span><br><span class="line"> if (this.lineBuf == null) {// 先读取最开头的"HTTP/1.1 200 OK"</span><br><span class="line"> this.lineBuf = new CharArrayBuffer(64);</span><br><span class="line"> } else {</span><br><span class="line"> this.lineBuf.clear(); // 清空接着用</span><br><span class="line"> }</span><br><span class="line"> // 若没有结束的话,每次读取一行,若读取的是\r\n,经过过滤,长度就变成了0,说明headers就读取完了</span><br><span class="line"> final boolean lineComplete = this.sessionBuffer.readLine(this.lineBuf, this.endOfStream);// 从sessionBuffer中读取一行</span><br><span class="line"> final int maxLineLen = this.constraints.getMaxLineLength();</span><br><span class="line"> if (maxLineLen > 0 &&</span><br><span class="line"> (this.lineBuf.length() > maxLineLen ||</span><br><span class="line"> (!lineComplete && this.sessionBuffer.length() > maxLineLen))) {</span><br><span class="line"> throw new MessageConstraintException("Maximum line length limit exceeded");</span><br><span class="line"> }</span><br><span class="line"> if (!lineComplete) {</span><br><span class="line"> break;</span><br><span class="line"> }</span><br><span class="line"></span><br><span class="line"> switch (this.state) {//more是0</span><br><span class="line"> case READ_HEAD_LINE:// read_head_line</span><br><span class="line"> try {</span><br><span class="line"> // 算是解析HTTP/1.1 200 OK</span><br><span class="line"> parseHeadLine();</span><br><span class="line"> } 
catch (final ParseException px) {</span><br><span class="line"> throw new ProtocolException(px.getMessage(), px);</span><br><span class="line"> }</span><br><span class="line"> this.state = READ_HEADERS;// read_headers</span><br><span class="line"> break;</span><br><span class="line"> case READ_HEADERS:// read_headers</span><br><span class="line"> if (this.lineBuf.length() > 0) {</span><br><span class="line"> // 若读取长度为0,就说明读取完了</span><br><span class="line"> final int maxHeaderCount = this.constraints.getMaxHeaderCount();</span><br><span class="line"> if (maxHeaderCount > 0 && headerBufs.size() >= maxHeaderCount) {</span><br><span class="line"> throw new MessageConstraintException("Maximum header count exceeded");</span><br><span class="line"> }</span><br><span class="line"></span><br><span class="line"> parseHeader();</span><br><span class="line"> } else {</span><br><span class="line"> this.state = COMPLETED;</span><br><span class="line"> }</span><br><span class="line"> break;</span><br><span class="line"> }</span><br><span class="line"> if (this.endOfStream && !this.sessionBuffer.hasData()) {</span><br><span class="line"> this.state = COMPLETED;</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> if (this.state == COMPLETED) {</span><br><span class="line"> for (final CharArrayBuffer buffer : this.headerBufs) {</span><br><span class="line"> try {</span><br><span class="line"> // 开始解析header</span><br><span class="line"> this.message.addHeader(lineParser.parseHeader(buffer));</span><br><span class="line"> } catch (final ParseException ex) {</span><br><span class="line"> throw new ProtocolException(ex.getMessage(), ex);</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> return this.message;</span><br><span class="line"> }</span><br><span class="line"> return null;</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>Parsing walks through state transitions as follows:<br><img src="https://kkewwei.github.io/elasticsearch_learning/img/httpasycnclient5.png" height="150" width="470"><br>The byte stream is framed as <code>HTTP/1.1 200 OK\r\nheaders\r\n\r\ncontents</code>: headers and content are separated by two consecutive <code>\r\n</code>, and <code>AbstractMessageParser.parse()</code> parses everything before the HTTP content.</p>
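<p>The framing rule is easy to demonstrate with plain JDK classes: read line by line, treat the first line as the status line, and stop at the empty line that separates headers from body. A toy sketch, not the AbstractMessageParser code:</p>
<figure class="highlight plain"><table><tr><td class="code"><pre>import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.LinkedHashMap;
import java.util.Map;

public class HeaderParseSketch {
    public static void main(String[] args) throws IOException {
        String raw = "HTTP/1.1 200 OK\r\nContent-Length: 5\r\n\r\nhello";
        BufferedReader reader = new BufferedReader(new StringReader(raw));
        String statusLine = reader.readLine();          // "HTTP/1.1 200 OK"
        Map<String, String> headers = new LinkedHashMap<>();
        String line;
        // a bare \r\n reads as the empty string and terminates the header section
        while ((line = reader.readLine()) != null && !line.isEmpty()) {
            int colon = line.indexOf(':');
            headers.put(line.substring(0, colon).trim(),
                        line.substring(colon + 1).trim());
        }
        System.out.println(statusLine + " " + headers); // the body follows the \r\n\r\n
    }
}</pre></td></tr></table></figure>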
<p>Next we look at how <code>SimpleInputBuffer.consumeContent()</code> assembles the whole content. When the headers were read earlier, 8 KB were pulled straight out of the SocketChannelImpl; only the HTTP header part was consumed at that point, but the 8 KB also holds part of the content, which is drained here as well.</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br></pre></td><td class="code"><pre><span class="line">public int consumeContent(final ContentDecoder decoder) throws IOException {</span><br><span class="line"> // 重新读取</span><br><span class="line"> setInputMode(); </span><br><span class="line"> int totalRead = 0;</span><br><span class="line"> int bytesRead; </span><br><span class="line"> // 可以申请多大的DirectBuffer,就读取多少数据</span><br><span class="line"> while ((bytesRead = decoder.read(this.buffer)) != -1) {</span><br><span class="line"> if (bytesRead == 0) { </span><br><span class="line"> if (!this.buffer.hasRemaining()) {</span><br><span class="line"> expand();</span><br><span class="line"> } else {</span><br><span class="line"> break;</span><br><span class="line"> }</span><br><span class="line"> } else {</span><br><span class="line"> totalRead += bytesRead; // 每次只能读取185472b左右的数据,若多了,这里while也读取不完</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> if (bytesRead == -1 || decoder.isCompleted()) {</span><br><span class="line"> this.endOfStream = true;</span><br><span class="line"> }</span><br><span class="line"> return totalRead;</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>This one is fairly simple: it loops over <code>decoder.read</code> to pull the remaining bytes out of the channel. <code>decoder.read</code> reads the HTTP content part as follows:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br></pre></td><td class="code"><pre><span class="line">public int read(final ByteBuffer dst) throws IOException {</span><br><span class="line"> final int chunk = (int) Math.min((this.contentLength - this.len), Integer.MAX_VALUE);</span><br><span class="line"> final int bytesRead;</span><br><span class="line"> if (this.buffer.hasData()) {</span><br><span class="line"> final int maxLen = Math.min(chunk, this.buffer.length());</span><br><span class="line"> bytesRead = this.buffer.read(dst, maxLen);</span><br><span class="line"> } else {</span><br><span class="line"> // 一次读取多少文档,取决于从DirectBufferCache中申请的DirectDuffer大小</span><br><span class="line"> bytesRead = readFromChannel(dst, chunk);</span><br><span class="line"> }</span><br><span class="line"> this.len += bytesRead;</span><br><span class="line"> if (this.len >= this.contentLength) {</span><br><span class="line"> setCompleted();</span><br><span class="line"> }</span><br><span class="line"> return isCompleted() && bytesRead == 0 ? -1 : bytesRead;</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>This function does two things:<br>1. First it moves the content left over from the earlier 8 KB read into dst.<br>2. Then it reads all remaining content from the channel into dst.<br>At this point the whole content has been read.</p>
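<p>The essence of this two-source read can be sketched independently of HttpCore: serve the body bytes that were buffered along with the headers first, and only then go back to the socket. A minimal illustration with assumed names:</p>
<figure class="highlight plain"><table><tr><td class="code"><pre>import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.ReadableByteChannel;

public class BodyReadSketch {
    // leftover: body bytes already read while parsing the headers; may be empty
    static int readBody(ByteBuffer leftover, ReadableByteChannel channel,
                        ByteBuffer dst) throws IOException {
        if (leftover.hasRemaining()) {
            int n = Math.min(leftover.remaining(), dst.remaining());
            ByteBuffer slice = leftover.duplicate();
            slice.limit(slice.position() + n);
            dst.put(slice);
            leftover.position(leftover.position() + n);
            return n;                 // serve buffered bytes first
        }
        return channel.read(dst);     // only then go back to the socket
    }
}</pre></td></tr></table></figure>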
<h1 id="总结"><a href="#总结" class="headerlink" title="总结"></a>总结</h1><p>http请求发送时存在3种可能,1.连接池无可用管道,连接也没达到上限,那么将请求交给boss线程新建管道,再交给worker线程发送请求。2.连接池有可用管道,那么直接将请求交给worker发送。3.连接池无可用管道,且连接个数已达上限,那么请求阻塞等待。每个管道一次只能发送一次请求,下个请求只能等当前请求完成、管道释放后才能进行,通过管道个数来限制连接并发,导致管道利用率不高,这里也许可以进行部分优化。</p>
</div>
<div class="article-info article-info-index">
<div class="article-category tagcloud">
<i class="icon-book icon"></i>
<ul class="article-tag-list">
<li class="article-tag-list-item">
<a href="/elasticsearch_learning/categories/Java学习//" class="article-tag-list-link color2">Java学习</a>
</li>
</ul>
</div>
<p class="article-more-link">
<a class="article-more-a" href="/elasticsearch_learning/2021/11/05/HTTP异步Client源码解析/">展开全文 >></a>
</p>
<div class="clearfix"></div>
</div>
</div>
</article>
<aside class="wrap-side-operation">
<div class="mod-side-operation">
<div class="jump-container" id="js-jump-container" style="display:none;">
<a href="javascript:void(0)" class="mod-side-operation__jump-to-top">
<i class="icon-font icon-back"></i>
</a>
<div id="js-jump-plan-container" class="jump-plan-container" style="top: -11px;">
<i class="icon-font icon-plane jump-plane"></i>
</div>
</div>
</div>
</aside>
<article id="post-ES-cat-nodes接口无响应问题定位" class="article article-type-post article-index" itemscope itemprop="blogPost">
<div class="article-inner">
<header class="article-header">
<h1 itemprop="name">
<a class="article-title" href="/elasticsearch_learning/2021/08/20/ES-cat-nodes接口无响应问题定位/">ES _cat/nodes接口无响应问题定位</a>
</h1>
<a href="/elasticsearch_learning/2021/08/20/ES-cat-nodes接口无响应问题定位/" class="archive-article-date">
<time datetime="2021-08-20T04:21:28.000Z" itemprop="datePublished"><i class="icon-calendar icon"></i>2021-08-20</time>
</a>
</header>
<div class="article-entry" itemprop="articleBody">
<h1 id="现象"><a href="#现象" class="headerlink" title="现象"></a>现象</h1><p>目前对ES集群的报警的一个重要接口是_cat/nodes,在线上环境,该接口经常超时无响应。</p>
<h1 id="定位"><a href="#定位" class="headerlink" title="定位"></a>定位</h1><h2 id="根据代码入手"><a href="#根据代码入手" class="headerlink" title="根据代码入手"></a>根据代码入手</h2><figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br></pre></td><td class="code"><pre><span class="line"> final ClusterStateRequest clusterStateRequest = new ClusterStateRequest();</span><br><span class="line"> clusterStateRequest.clear().nodes(true);</span><br><span class="line"> return channel -> client.admin().cluster().state(clusterStateRequest, new RestActionListener<ClusterStateResponse>(channel) { </span><br><span class="line"> @Override </span><br><span class="line"> public void processResponse(final ClusterStateResponse clusterStateResponse) { </span><br><span class="line"> ......</span><br><span class="line"> // 向每个节点发送将发送_nodes/{nodeId}</span><br><span class="line"> client.admin().cluster().nodesInfo(nodesInfoRequest, new RestActionListener<NodesInfoResponse>(channel) {</span><br><span class="line"> @Override</span><br><span class="line"> public void processResponse(final NodesInfoResponse nodesInfoResponse) {</span><br><span class="line"> ...</span><br><span class="line"> // 向每个节点发送将发送_nodes/stats</span><br><span class="line"> client.admin().cluster().nodesStats(nodesStatsRequest, new RestResponseListener<NodesStatsResponse>(channel) {</span><br><span class="line"> @Override // </span><br><span class="line"> public RestResponse buildResponse(NodesStatsResponse nodesStatsResponse) throws Exception {</span><br><span class="line"> return ...</span><br><span class="line"> }</span><br><span class="line"> });</span><br><span class="line"> }</span><br><span class="line"> });</span><br><span class="line"> }</span><br><span class="line"> });</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>When a client sends a _cat/nodes request, the coordinating node does three things in order:<br>1. It asks the master for _cluster/state to obtain the full node list, with a 30s timeout.<br>2. It sends a /_nodes/{nodeId} request to every node to fetch static node info (jvm, os, http, process, etc.), with no timeout configured.<br>3. It sends a /_nodes/{nodeId}/stats request to every node to fetch the current jvm, os, fs, process metrics, again with no timeout configured.<br>Conclusion: the first suspicion was that steps 2 or 3 were at fault, with some nodes timing out while collecting a particular metric.</p>
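<p>For what it's worth, <code>BaseNodesRequest</code> does expose a timeout, so a guard similar to step 1's could in principle be set on the fan-out requests. A hedged sketch (package paths are from the 7.x line and may differ across versions):</p>
<figure class="highlight plain"><table><tr><td class="code"><pre>import org.elasticsearch.action.admin.cluster.node.stats.NodesStatsRequest;
import org.elasticsearch.common.unit.TimeValue;

public class NodesStatsTimeoutSketch {
    // bound the per-node wait instead of the default unbounded wait
    public static NodesStatsRequest buildWithTimeout() {
        NodesStatsRequest request = new NodesStatsRequest();
        request.timeout(TimeValue.timeValueSeconds(30)); // 30s cap, mirroring step 1
        return request;
    }
}</pre></td></tr></table></figure>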
<h2 id="硬件资源入手"><a href="#硬件资源入手" class="headerlink" title="硬件资源入手"></a>硬件资源入手</h2><p>节点处问题期间,master日志也报如下日志:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">[2021-08-12T11:41:16,407][WARN ][o.e.t.TransportService ] [rz-data-hdp-dn-rtyarn0108] Received response for a request that has timed out, sent [80029ms] ago, timed out [65024ms] ago, action [cluster:monitor/nodes/stats[n]], node [{data1}], id [95175528]</span><br></pre></td></tr></table></figure>
<p>From this we could roughly guess that this node was causing the API timeouts. The hardware metrics looked like this:<br>Memory: node memory usage was not high, around 50%-70%.<br>CPU: not saturated either, below 70%.<br>IO: on some nodes, IO utilization stayed at 100% continuously.<br>Conclusion: based on these metrics, the sustained 100% IO looked like the cause of the unresponsive requests.</p>
<h2 id="再具体分析每步请求"><a href="#再具体分析每步请求" class="headerlink" title="再具体分析每步请求"></a>再具体分析每步请求</h2><p>1.分析第一步<br>协调节点向集群master发送请求,获取所有节点。此时master所有返回结果都是从内存中拿的,不涉及到IO,且master负载较低。肯定不是第一步出现了问题。<br>2.分析第二步<br>每个节点在响应时,会获取本地jvm,os,http,process等初始化信息,但是这些信息在ES启动时就已经完成初始化了,之后每次请求也都是从内存中获取现成的,不涉及到IO操作,看起来也不是第2步出现了问题。<br>3.分析第三步<br>每个节点在响应时,会获取本地jvm,os,fs,process等运行指标,这些指标都是实时获取的,其中fs、jvm等运行指标会与jvm,磁盘打交道,容易受节点压力大导致响应超时等现象。为了复现这些指标的获取,甚至将这些指标单独摘取出来,在IO负载高的机器上单独运行,指标都获取很快,并没有出现超时的现象。<br>结论</p>
<p>The theoretical analysis went nowhere: at the code level we could not explain why the API timed out.</p>
<h2 id="测试"><a href="#测试" class="headerlink" title="测试"></a>测试</h2><h3 id="测试1"><a href="#测试1" class="headerlink" title="测试1"></a>测试1</h3><p>虽然第3步指标获取单独跳出来,测试没啥问题,可能分析没把重要的一步给找到,那么我们就开发特定代码,将第3步中的指标获取全部取消,这样确定第3步只涉及到内存操作,理论上超时现象应该可以极大缓解。<br>经过线上压力不大的集群进行_cat/nodes线上测试:</p>
<ul>
<li>With the normal API, average latency was 3s+.</li>
<li>With the custom branch, average latency was 0.5s+.<br>It looked like we had found the problem.</li>
</ul>
<p>Testing _cat/nodes on a heavily loaded production cluster, where the fix should in theory also help:</p>
<ul>
<li>With the normal API, average latency was 25s+.</li>
<li>With the custom branch, average latency was also 25s+.<br>The custom branch showed no improvement at all.<br>Conclusion: the real-time fs, jvm metric collection in step three is confirmed not to be the bottleneck.</li>
</ul>
<h3 id="测试2"><a href="#测试2" class="headerlink" title="测试2"></a>测试2</h3><p>模仿协调节点依次向master发送_cluster/state,每个数据节点发送/_nodes/{nodeId}、/_nodes/{nodeId}/stats请求,发现协调节点在/_nodes/{nodeId}时,IO负载高的节点总是超时,有时1min+都无响应,确定是这步操作出问题了。我们又仔细核对了第二步响应的指标,的确都是从内存中获取的,完全与IO无关。<br>结论:第二步的确出问题了,但是与IO,内存都无关。</p>
<h3 id="测试3"><a href="#测试3" class="headerlink" title="测试3"></a>测试3</h3><p>我们尝试向问题节点发送_cat/pending_tasks等请求,发现这些请求响应非常快。他们区别就是可能使用的线程池不一样,第一反应就是问题节点接收/_nodes/{nodeId}/stats的线程池被打满了。</p>
<ul>
<li>We confirmed that the problem node handles /_nodes/{nodeId}/stats on the management thread pool, while _cat/pending_tasks goes through no thread pool, so the pools do indeed differ.</li>
<li>Fetching each node's thread pool usage in production:<figure class="highlight plain"><table><tr><td class="code"><pre>"management" : {
  "threads" : 5,
  "queue" : 793,
  "active" : 5,
  "rejected" : 0,
  "largest" : 5,
  "completed" : 888436
},</pre></td></tr></table></figure></li>
</ul>
<p>The pool showed severe queueing: only 5 worker threads, yet 793 tasks queued. This confirmed that the saturated thread pool was why requests got no response.</p>
<ul>
<li>So what are the management threads actually doing?<figure class="highlight plain"><table><tr><td class="code"><pre>"elasticsearch[data1][management][T#5]" #176 daemon prio=5 os_prio=0 tid=0x00007f7854003800 nid=0x7e2 runnable [0x00007f79ab8f8000]
   java.lang.Thread.State: RUNNABLE
	at sun.nio.ch.FileDispatcherImpl.force0(Native Method)
	at sun.nio.ch.FileDispatcherImpl.force(FileDispatcherImpl.java:76)
	at sun.nio.ch.FileChannelImpl.force(FileChannelImpl.java:388)
	at org.apache.lucene.util.IOUtils.fsync(IOUtils.java:471)
	at org.apache.lucene.store.FSDirectory.syncMetaData(FSDirectory.java:309)
	at org.elasticsearch.gateway.MetadataStateFormat.performStateDirectoriesFsync(MetadataStateFormat.java:172)
	at org.elasticsearch.gateway.MetadataStateFormat.write(MetadataStateFormat.java:246)
	at org.elasticsearch.gateway.MetadataStateFormat.writeAndCleanup(MetadataStateFormat.java:185)
	at org.elasticsearch.index.seqno.ReplicationTracker.persistRetentionLeases(ReplicationTracker.java:494)
	- locked <0x00007f7c4981d978> (a java.lang.Object)
	at org.elasticsearch.index.shard.IndexShard.persistRetentionLeases(IndexShard.java:2256)
	at org.elasticsearch.index.seqno.RetentionLeaseBackgroundSyncAction.lambda$shardOperationOnPrimary$0(RetentionLeaseBackgroundSyncAction.java:161)
	at org.elasticsearch.index.seqno.RetentionLeaseBackgroundSyncAction$$Lambda$5355/1679727689.get(Unknown Source)
	at org.elasticsearch.action.ActionListener.completeWith(ActionListener.java:325)
	at org.elasticsearch.index.seqno.RetentionLeaseBackgroundSyncAction.shardOperationOnPrimary(RetentionLeaseBackgroundSyncAction.java:157)
	at org.elasticsearch.index.seqno.RetentionLeaseBackgroundSyncAction.shardOperationOnPrimary(RetentionLeaseBackgroundSyncAction.java:64)
	at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform(TransportReplicationAction.java:968)
	at org.elasticsearch.action.support.replication.ReplicationOperation.execute(ReplicationOperation.java:122)
	at org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.runWithPrimaryShardReference(TransportReplicationAction.java:429)</pre></td></tr></table></figure></li>
</ul>
<p>The management threads were busy doing fsync-to-disk IO on behalf of other actions; with IO under heavy pressure the fsyncs were extremely slow, so requests backed up badly. That is the root cause. Scanning the codebase, the management thread pool is shared by many APIs.<br>We have <a href="https://github.com/elastic/elasticsearch/pull/62753#issuecomment-948551092" target="_blank" rel="noopener">reported this to the community</a>; no fix has been offered yet.</p>
<h1 id="结论"><a href="#结论" class="headerlink" title="结论"></a>结论</h1><p>协调节点使用management线程池处理/_nodes/{nodeId}/stats请求,但是management线程池与其他接口一起共用,其他接口有落盘操作,IO打满导致落盘落不下去,极大降低了management线程工作效率,进而影响到_cat/nodes请求。根本原因还是IO压力大导致的。</p>
</div>
<div class="article-info article-info-index">
<div class="article-tag tagcloud">
<i class="icon-price-tags icon"></i>
<ul class="article-tag-list">
<li class="article-tag-list-item">
<a href="javascript:void(0)" class="js-tag article-tag-list-link color3">_cat/nodes接口</a>
</li>
</ul>
</div>
<div class="article-category tagcloud">
<i class="icon-book icon"></i>
<ul class="article-tag-list">
<li class="article-tag-list-item">
<a href="/elasticsearch_learning/categories/Elasticsearch//" class="article-tag-list-link color4">Elasticsearch</a>
</li>
</ul>
</div>
<p class="article-more-link">
<a class="article-more-a" href="/elasticsearch_learning/2021/08/20/ES-cat-nodes接口无响应问题定位/">展开全文 >></a>
</p>
<div class="clearfix"></div>
</div>
</div>
</article>
<aside class="wrap-side-operation">
<div class="mod-side-operation">
<div class="jump-container" id="js-jump-container" style="display:none;">
<a href="javascript:void(0)" class="mod-side-operation__jump-to-top">
<i class="icon-font icon-back"></i>
</a>
<div id="js-jump-plan-container" class="jump-plan-container" style="top: -11px;">
<i class="icon-font icon-plane jump-plane"></i>
</div>
</div>
</div>
</aside>
<article id="post-Lucene8-6-2底层架构-Segment-StoredFields合并原理详解" class="article article-type-post article-index" itemscope itemprop="blogPost">
<div class="article-inner">
<header class="article-header">
<h1 itemprop="name">
<a class="article-title" href="/elasticsearch_learning/2021/06/13/Lucene8-6-2底层架构-Segment-StoredFields合并原理详解/">Lucene8.6.2底层架构-Segment StoredFields合并原理详解</a>
</h1>
<a href="/elasticsearch_learning/2021/06/13/Lucene8-6-2底层架构-Segment-StoredFields合并原理详解/" class="archive-article-date">
<time datetime="2021-06-13T09:09:57.000Z" itemprop="datePublished"><i class="icon-calendar icon"></i>2021-06-13</time>
</a>
</header>
<div class="article-entry" itemprop="articleBody">
<p>In <a href="https://kkewwei.github.io/elasticsearch_learning/2019/10/17/ES%E6%AE%B5%E5%90%88%E5%B9%B6%E6%BA%90%E7%A0%81%E5%88%86%E6%9E%90/">the ES segment-merge analysis</a> we covered in detail how segments are picked for merging; this article explains how the merge thread merges StoredFields inside a Merge. The StoredFields merge is essentially the process of building a new set of StoredFields, which is described in <a href="https://kkewwei.github.io/elasticsearch_learning/2019/10/29/Lucenec%E5%BA%95%E5%B1%82%E6%9E%B6%E6%9E%84-fdt-fdx%E6%9E%84%E5%BB%BA%E8%BF%87%E7%A8%8B/">Lucene 8.2.0 internals: the fdt/fdx build process</a>. This article is based on the Lucene 8.6.2 layout; the on-disk StoredFields index structure changed slightly compared to 8.2.0, so for reference here is the StoredFields file layout:<br><img src="https://kkewwei.github.io/elasticsearch_learning/img/lucene7.8.2_segment_fdt1.png" height="400" width="1000"></p>
<h1 id="构建CompressingStoredFieldsWriter对象"><a href="#构建CompressingStoredFieldsWriter对象" class="headerlink" title="构建CompressingStoredFieldsWriter对象"></a>构建CompressingStoredFieldsWriter对象</h1><p>merge的过程就是将几个Segment重新合并、写到一个segment中,其中必然需要先创建一个新的Merge,并构建新Segment storedFields写入的Writer,本文将以<code>SegmentMerger.mergeFields()</code>开始介绍:首先构建CompressingStoredFieldsWriter对象,然后进行StoredFields的写入:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">private int mergeFields() throws IOException {</span><br><span class="line"> try (StoredFieldsWriter fieldsWriter = codec.storedFieldsFormat().fieldsWriter(directory, mergeState.segmentInfo, context)) {</span><br><span class="line"> return fieldsWriter.merge(mergeState);</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<h1 id="开始合并"><a href="#开始合并" class="headerlink" title="开始合并"></a>开始合并</h1><p>merge过程是整个StoredFields写入的核心代码:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br></pre></td><td class="code"><pre><span class="line">public int merge(MergeState mergeState) throws IOException {</span><br><span class="line"> // 统计当前merge中所有segment中包含的文档数</span><br><span class="line"> int docCount = 0; </span><br><span class="line"> int numReaders = mergeState.maxDocs.length; // 多少个semgent来合并</span><br><span class="line"> MatchingReaders matching = new MatchingReaders(mergeState);</span><br><span class="line"> if (mergeState.needsIndexSort) {</span><br><span class="line"> ....</span><br><span class="line"> }</span><br><span class="line"> for (int readerIndex=0;readerIndex<numReaders;readerIndex++) {</span><br><span class="line"> ......</span><br><span class="line"> // 这个segment的文档ID</span><br><span class="line"> final int maxDoc = mergeState.maxDocs[readerIndex];</span><br><span class="line"> final Bits liveDocs = mergeState.liveDocs[readerIndex];</span><br><span class="line"> // 一般跳过if</span><br><span class="line"> // if its some other format, or an older version of this format, or safety switch:</span><br><span class="line"> if (matchingFieldsReader == null || matchingFieldsReader.getVersion() != VERSION_CURRENT </span><br><span class="line"> || BULK_MERGE_ENABLED == false) {</span><br><span class="line"> ......</span><br><span class="line"> } else if (matchingFieldsReader.getCompressionMode() == compressionMode &&</span><br><span class="line"> matchingFieldsReader.getChunkSize() == chunkSize && </span><br><span class="line"> matchingFieldsReader.getPackedIntsVersion() == PackedInts.VERSION_CURRENT &&</span><br><span class="line"> liveDocs == null &&</span><br><span class="line"> !tooDirty(matchingFieldsReader)) {</span><br><span class="line"> 批量读取StoredFields并写入新的StoredFields中 </span><br><span class="line"> } else {</span><br><span class="line"> 读取每个doc,将doc写入新的StoredFields中</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> finish(mergeState.mergeFieldInfos, docCount);</span><br><span class="line"> return docCount;</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>This method loops over every segment and checks its StoredFields: does it have deletions, or more than 1024 dirty chunks, or dirty chunks making up more than 1% of the segment?<br>1. If none of these hold, it takes the <code>else if</code> branch and bulk-copies the chunks of the old StoredFields into the new StoredFields.<br>2. If any of them hold, it takes the <code>else</code> branch and re-reads every doc, writing the documents into memory one by one through the CompressingStoredFieldsWriter.<br>3. Finally the buffered documents are flushed into the new StoredFields.</p>
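<p>The dirtiness check can be reconstructed from that description as a small predicate (cf. <code>tooDirty()</code> in CompressingStoredFieldsWriter; this is our paraphrase of the rule, not the verbatim source):</p>
<figure class="highlight plain"><table><tr><td class="code"><pre>public class DirtyChunkSketch {
    // a segment is merged document-by-document when it is "too dirty"
    static boolean tooDirty(long numDirtyChunks, long numChunks) {
        // hard cap of 1024 dirty chunks, or more than 1% of all chunks dirty
        return numDirtyChunks > 1024 || numDirtyChunks * 100 > numChunks;
    }
}</pre></td></tr></table></figure>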
<h2 id="批量读取旧StoredFields中的chunk写到新的StoredFields中"><a href="#批量读取旧StoredFields中的chunk写到新的StoredFields中" class="headerlink" title="批量读取旧StoredFields中的chunk写到新的StoredFields中"></a>批量读取旧StoredFields中的chunk写到新的StoredFields中</h2><figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br></pre></td><td class="code"><pre><span class="line">//检验fdx的正确性</span><br><span class="line"> matchingFieldsReader.checkIntegrity(); </span><br><span class="line"> // flush any pending chunks</span><br><span class="line"> // 如果还有未写入的段</span><br><span class="line"> if (numBufferedDocs > 0) { </span><br><span class="line"> flush();</span><br><span class="line"> numDirtyChunks++; // incomplete: we had to force this flush</span><br><span class="line"> }</span><br><span class="line"> // fdt文件</span><br><span class="line"> IndexInput rawDocs = matchingFieldsReader.getFieldsStream(); </span><br><span class="line"> // FieldsIndexReader</span><br><span class="line"> FieldsIndex index = matchingFieldsReader.getIndexReader(); </span><br><span class="line"> // 首先从第0个chunk开始找</span><br><span class="line"> rawDocs.seek(index.getStartPointer(0));</span><br><span class="line"> int docID = 0; // 这个segment目前的文档数</span><br><span class="line"> // 每循环一次,读取一个chunk</span><br><span class="line"> while (docID < maxDoc) {</span><br><span class="line"> // read header</span><br><span class="line"> int base = rawDocs.readVInt();</span><br><span class="line"> int code = rawDocs.readVInt();</span><br><span class="line"> // write a new index entry and new header for this chunk.</span><br><span class="line"> int bufferedDocs = code >>> 1;</span><br><span class="line"> // 向新的dft写入</span><br><span class="line"> indexWriter.writeIndex(bufferedDocs, fieldsStream.getFilePointer());</span><br><span class="line"> fieldsStream.writeVInt(docBase); // rebase</span><br><span class="line"> fieldsStream.writeVInt(code);</span><br><span class="line"> docID += bufferedDocs;</span><br><span class="line"> docBase += bufferedDocs;</span><br><span class="line"> docCount += bufferedDocs;</span><br><span class="line"> if (docID == maxDoc) {</span><br><span class="line"> end = matchingFieldsReader.getMaxPointer();</span><br><span class="line"> } else {</span><br><span class="line"> // 下个chunk的起始位置</span><br><span 
class="line"> end = index.getStartPointer(docID); </span><br><span class="line"> } // 从原始fdt中批量拷贝数据,写入新的fdt中</span><br><span class="line"> fieldsStream.copyBytes(rawDocs, end - rawDocs.getFilePointer());</span><br><span class="line"> } //是拷贝fdt中每个chunK的后三项(numStoredFields, Doclength BufferDoc)</span><br><span class="line"> // since we bulk merged all chunks, we inherit any dirty ones from this segment.</span><br><span class="line"> numChunks += matchingFieldsReader.getNumChunks();</span><br><span class="line"> numDirtyChunks += matchingFieldsReader.getNumDirtyChunks();</span><br></pre></td></tr></table></figure>
<p>主要做了如下步骤:<br>1.首先检查是否还有未刷新的文档,若有的话,则调用flush进行刷新。<br>2.调用<code>index.getStartPointer(0)</code>查询到第0个文档中dft的起始位置,然后调用ChecksumIndexInput.seek()跳转到文件指定位置。<br>3.轮循这个Segment每个chunk, 依次读取docBase(用来更新全局的docBase,旧的docBase将废弃)、bufferedDocs、本chunk在fdt文件中的终点。最终会copy chunk后三部分内容。</p>
<h3 id="找到docId在fdt存储的起始位置"><a href="#找到docId在fdt存储的起始位置" class="headerlink" title="找到docId在fdt存储的起始位置"></a>找到docId在fdt存储的起始位置</h3><p>首先我们详细介绍下<code>FieldsIndexReader.getStartPointer(docId)</code>如何从fdm中将文档docId所在chunk的存储地址(fdt中存储的起始位置)读取出来的</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br></pre></td><td class="code"><pre><span class="line">long getStartPointer(int docID) {</span><br><span class="line"> FutureObjects.checkIndex(docID, maxDoc);</span><br><span class="line"> // 找到这个docId的文档在哪个chunk上</span><br><span class="line"> long blockIndex = docs.binarySearch(0, numChunks, docID); </span><br><span class="line"> // 找一个chunk内的docId</span><br><span class="line"> if (blockIndex < 0) {</span><br><span class="line"> blockIndex = -2 - blockIndex;</span><br><span class="line"> }</span><br><span class="line"> return startPointers.get(blockIndex); // DirectMonotonicReader</span><br><span class="line"> }</span><br></pre></td></tr></table></figure>
<p>该函数主要做了三个操作:<br>1.首选确定这个docId一定在这segment中存在,那么第2步骤一定可以查询到这个docId存在某个chunk上。<br>2.调用<code>DirectMonotonicReader.binarySearch</code>进行二分搜索这个docId落在了哪个chunk上, 退出该函数的条件有两个:<br>+.正好命中原始累加值, 那么就直接返回该id所在chunkId<br>+.low=hig+1, 且id在当前范围内, 则返回 -1 - low(那么最外层-2-blockIndex=low-1=high), 这时,该docId一定在第low-1个chunk上<br>3.调用startPointers(),解析fdm文件的filepointer部分,获取这个chunk在fdt上存储的起始位置。<br>后面会分别详细展开第2、3步骤。<br>我们首先看下<code>DirectMonotonicReader.binarySearch</code>二叉搜索过程:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br></pre></td><td class="code"><pre><span class="line">public long binarySearch(long fromIndex, long toIndex, long key) {</span><br><span class="line"> if (fromIndex < 0 || fromIndex > toIndex) {</span><br><span class="line"> throw new IllegalArgumentException("fromIndex=" + fromIndex + ",toIndex=" + toIndex);</span><br><span class="line"> }</span><br><span class="line"> long lo = fromIndex;</span><br><span class="line"> long hi = toIndex - 1;</span><br><span class="line"> while (lo <= hi) {</span><br><span class="line"> final long mid = (lo + hi) >>> 1;</span><br><span class="line"> // Try to run as many iterations of the binary search as possible without</span><br><span class="line"> // hitting the direct readers, since they might hit a page fault.</span><br><span class="line"> final long[] bounds = getBounds(mid);</span><br><span class="line"> if (bounds[1] < key) {</span><br><span class="line"> lo = mid + 1;</span><br><span class="line"> } else if (bounds[0] > key) {</span><br><span class="line"> hi = mid - 1;</span><br><span class="line"> } else {</span><br><span class="line"> final long midVal = get(mid);</span><br><span class="line"> if (midVal < key) {</span><br><span class="line"> lo = mid + 1;</span><br><span class="line"> } else if (midVal > key) {</span><br><span class="line"> hi = mid - 1;</span><br><span class="line"> } else {</span><br><span class="line"> return mid;</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> return -1 - lo;</span><br><span class="line"> }</span><br></pre></td></tr></table></figure>
<p>我们首先需要知道DirectMonotonicWriter的存储方式:比如我们需要存储3,2,6,2,6(每个数字表示每个chunk的文档个数), 那么在存储的时候,实际DirectMonotonicReader存储的数字是3,5,11,13,19,使之变成了递增数组。此函数就是为了找key=docId的文档在哪个chunk中。在递增数组中可以通过二分查找来确定。<br>我们需要看下两个细节:<code>getBounds(mid)</code>确定这个chunk的文档id的范围,<code>get(mid)</code>确定这个chunk的起始位置。为了更清晰帮助用户理解,先讲解下<code>DirectMonotonicWriter.flush</code>是如何存储数据的:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br></pre></td><td class="code"><pre><span class="line">// 在storedField写入时,buffer里面每个元素,都是一个chunk</span><br><span class="line">private void flush() throws IOException { </span><br><span class="line"> assert bufferSize != 0;</span><br><span class="line"> // 首先计算这组数据的梯度差</span><br><span class="line"> final float avgInc = (float) ((double) (buffer[bufferSize-1] - buffer[0]) / Math.max(1, bufferSize - 1));</span><br><span class="line"> for (int i = 0; i < bufferSize; ++i) {</span><br><span class="line"> // 期望值从0开始</span><br><span class="line"> final long expected = (long) (avgInc * (long) i); </span><br><span class="line"> //实际值-期望值</span><br><span class="line"> buffer[i] -= expected;</span><br><span class="line"> }</span><br><span class="line"> // 找出最小的值</span><br><span class="line"> long min = buffer[0];</span><br><span class="line"> for (int i = 1; i < bufferSize; ++i) {</span><br><span class="line"> min = Math.min(buffer[i], min);</span><br><span class="line"> }</span><br><span class="line"> // 可能的最大值</span><br><span class="line"> long maxDelta = 0; </span><br><span class="line"> for (int i = 0; i < bufferSize; ++i) {</span><br><span class="line"> buffer[i] -= min;//比最小值大多少</span><br><span class="line"> // use | will change nothing when it comes to computing required bits</span><br><span class="line"> // but has the benefit of working fine with negative values too</span><br><span class="line"> // (in case of overflow)</span><br><span class="line"> maxDelta |= buffer[i];</span><br><span class="line"> }</span><br><span class="line"> //第一步,先存储chunk个数与理论平均值差距的最小值</span><br><span class="line"> meta.writeLong(min); </span><br><span class="line"> // 第二步,存储理论平均值</span><br><span class="line"> meta.writeInt(Float.floatToIntBits(avgInc)); </span><br><span class="line"> // 在fdx中的起始位置</span><br><span class="line"> meta.writeLong(data.getFilePointer() - baseDataPointer); </span><br><span class="line"> if (maxDelta == 0) {</span><br><span class="line"> meta.writeByte((byte) 
0);</span><br><span class="line"> } else {</span><br><span class="line"> // 需要几位</span><br><span class="line"> final int bitsRequired = DirectWriter.unsignedBitsRequired(maxDelta);</span><br><span class="line"> DirectWriter writer = DirectWriter.getInstance(data, bufferSize, bitsRequired);</span><br><span class="line"> for (int i = 0; i < bufferSize; ++i) {</span><br><span class="line"> // 每个chunk与理论平均值之间的差距</span><br><span class="line"> writer.add(buffer[i]); </span><br><span class="line"> }</span><br><span class="line"> writer.finish();</span><br><span class="line"> // 写入需要的byte个数</span><br><span class="line"> meta.writeByte((byte) bitsRequired); </span><br><span class="line"> }</span><br><span class="line"> bufferSize = 0;</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>仍然以上面的数组示例来说明存储了数据到底是啥:<br>原始数: 3 2 6 2 6<br>写入前: 3 5 11 13 19 //原始数据累加值,->avgInc:(19-3)/4 = 4<br>梯度值: 0 4 8 12 16<br>变换1: 3 1 3 1 3 //写入前-梯度值, ->最小值min: 1<br>变换2: 2 0 2 0 2 //变换1-min,->maxDelta =max(变换2)<br>bound:(1,4) (5,8) (9,12) (13,16) (17,20)<br>1.若还原第1个(下标从0开始)元素写入前的原始日志:1+4x1+0=5,也就是 <code>min+avg*index+变换2</code>,这个就是DirectMonotonicReader.get()的实现:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre></td><td class="code"><pre><span class="line">//index是下标(一个元素就是一个chunk个数),返回的是这个chunk的文档个数</span><br><span class="line">public long get(long index) { </span><br><span class="line"> // 在第几个block上</span><br><span class="line"> final int block = (int) (index >>> blockShift); </span><br><span class="line"> // 这个block内第几个chunk</span><br><span class="line"> final long blockIndex = index & ((1 << blockShift) - 1);</span><br><span class="line"> // 从fdx中读偏移量 </span><br><span class="line"> final long delta = readers[block].get(blockIndex); </span><br><span class="line"> // 返回这chunk对应的数字</span><br><span class="line"> return mins[block] + (long) (avgs[block] * blockIndex) + delta; </span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>2.若我们想估算第i个元素(一个元素就是一个chunk的个数)的取值范围:<br>第i个元素=mins[block] + (long) (avgs[block] * blockIndex) + delta,那么下列等式将成立:<br>lowerBound = mins[block] + (long) (avgs[block] * blockIndex)<br>upperBound = lowerBound + maxDelta = lowerBound + (1 << bitsRequired -1)<br>这个也就是<code>DirectMonotonicReader.getBounds(mid)</code>的定义:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br></pre></td><td class="code"><pre><span class="line"> // index=第i个chunk</span><br><span class="line">private long[] getBounds(long index) {</span><br><span class="line"> // 这个chunk在第几个block上上</span><br><span class="line"> final int block = Math.toIntExact(index >>> blockShift); </span><br><span class="line"> // 落到某个block内的chunk数</span><br><span class="line"> final long blockIndex = index & ((1 << blockShift) - 1); </span><br><span class="line"> // 这个chunk的下限</span><br><span class="line"> final long lowerBound = mins[block] + (long) (avgs[block] * blockIndex);</span><br><span class="line"> // 这个chunk的上限 </span><br><span class="line"> final long upperBound = lowerBound + (1L << bpvs[block]) - 1;</span><br><span class="line"> if (bpvs[block] == 64 || upperBound < lowerBound) {</span><br><span class="line"> return new long[] { Long.MIN_VALUE, Long.MAX_VALUE };</span><br><span class="line"> } else {</span><br><span class="line"> return new long[] { lowerBound, upperBound };</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<h3 id="从fdt中循环读取每个chunk"><a href="#从fdt中循环读取每个chunk" class="headerlink" title="从fdt中循环读取每个chunk"></a>从fdt中循环读取每个chunk</h3><p>当前已知chunkId编号,则直接从fdm中该chunk的filePointer部分,根据前面提到的<code>DirectMonotonicReader.get()</code>取出该doc所在chunk在fdt中的存储位置。然后进入BlockState.reset(int docID)加载具体的chunk内容:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br><span class="line">68</span><br><span class="line">69</span><br><span class="line">70</span><br><span class="line">71</span><br><span class="line">72</span><br><span class="line">73</span><br><span class="line">74</span><br><span class="line">75</span><br><span class="line">76</span><br><span class="line">77</span><br><span class="line">78</span><br><span class="line">79</span><br><span class="line">80</span><br><span class="line">81</span><br><span class="line">82</span><br><span class="line">83</span><br><span class="line">84</span><br><span class="line">85</span><br><span class="line">86</span><br><span class="line">87</span><br><span class="line">88</span><br></pre></td><td class="code"><pre><span class="line">void reset(int docID) throws IOException {</span><br><span class="line"> boolean success = false;</span><br><span class="line"> try {</span><br><span class="line"> //读取这个docID所在chunk的所有原始数据</span><br><span class="line"> doReset(docID); </span><br><span class="line"> success = true;</span><br><span class="line"> } finally {</span><br><span class="line"> if (success == false) {</span><br><span class="line"> // if the read failed, set chunkDocs to 0 so that it does not</span><br><span class="line"> // contain any docs anymore and is not reused. 
This should help</span><br><span class="line"> // get consistent exceptions when trying to get several</span><br><span class="line"> // documents which are in the same corrupted block since it will</span><br><span class="line"> // force the header to be decoded again</span><br><span class="line"> chunkDocs = 0;</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line">}</span><br><span class="line">private void doReset(int docID) throws IOException {</span><br><span class="line"> // 获取这个docID所在的chnk在fdt中的起始位置</span><br><span class="line"> docBase = fieldsStream.readVInt(); </span><br><span class="line"> final int token = fieldsStream.readVInt();</span><br><span class="line"> chunkDocs = token >>> 1;</span><br><span class="line"> // 是否压缩了</span><br><span class="line"> sliced = (token & 1) != 0;</span><br><span class="line"> offsets = ArrayUtil.grow(offsets, chunkDocs + 1);</span><br><span class="line"> numStoredFields = ArrayUtil.grow(numStoredFields, chunkDocs);</span><br><span class="line"> if (chunkDocs == 1) {</span><br><span class="line"> numStoredFields[0] = fieldsStream.readVInt();</span><br><span class="line"> offsets[1] = fieldsStream.readVInt();</span><br><span class="line"> } else {</span><br><span class="line"> // Number of stored fields per document</span><br><span class="line"> final int bitsPerStoredFields = fieldsStream.readVInt();</span><br><span class="line"> // 首先读取numDoc个numStoredFields</span><br><span class="line"> if (bitsPerStoredFields == 0) { </span><br><span class="line"> Arrays.fill(numStoredFields, 0, chunkDocs, fieldsStream.readVInt());</span><br><span class="line"> } else {</span><br><span class="line"> final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields, 1);</span><br><span class="line"> for (int i = 0; i < chunkDocs; ++i) {</span><br><span class="line"> numStoredFields[i] = (int) it.next();</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> // The stream encodes the length of each document and we decode</span><br><span class="line"> // it into a list of monotonically increasing offsets 其次读取numDoc个docLength</span><br><span class="line"> final int bitsPerLength = fieldsStream.readVInt();</span><br><span class="line"> // 长度一致</span><br><span class="line"> if (bitsPerLength == 0) { </span><br><span class="line"> final int length = fieldsStream.readVInt();</span><br><span class="line"> for (int i = 0; i < chunkDocs; ++i) {</span><br><span class="line"> offsets[1 + i] = (1 + i) * length; //</span><br><span class="line"> }</span><br><span class="line"> } else {</span><br><span class="line"> // 一般都跑到这里,计算出这个doc在fdt中的偏移量。</span><br><span class="line"> final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1);</span><br><span class="line"> for (int i = 0; i < chunkDocs; ++i) {</span><br><span class="line"> offsets[i + 1] = (int) it.next();</span><br><span class="line"> }</span><br><span class="line"> for (int i = 0; i < chunkDocs; ++i) {</span><br><span class="line"> // 在fdt中每个doc的偏移量</span><br><span class="line"> offsets[i + 1] += offsets[i]; </span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> // Additional validation: only the empty document has a serialized length of 0</span><br><span class="line"> for (int i = 0; i < chunkDocs; ++i) 
{</span><br><span class="line"> final int len = offsets[i + 1] - offsets[i];</span><br><span class="line"> final int storedFields = numStoredFields[i];</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> // 读取当前读到的地方</span><br><span class="line"> startPointer = fieldsStream.getFilePointer();</span><br><span class="line"> // 读取真正的doc 原始压缩数据</span><br><span class="line"> if (merging) {</span><br><span class="line"> final int totalLength = offsets[chunkDocs];</span><br><span class="line"> // decompress eagerly</span><br><span class="line"> if (sliced) {</span><br><span class="line"> bytes.offset = bytes.length = 0;</span><br><span class="line"> for (int decompressed = 0; decompressed < totalLength; ) {</span><br><span class="line"> final int toDecompress = Math.min(totalLength - decompressed, chunkSize);</span><br><span class="line"> decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, spare);</span><br><span class="line"> bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + spare.length);</span><br><span class="line"> System.arraycopy(spare.bytes, spare.offset, bytes.bytes, bytes.length, spare.length);</span><br><span class="line"> bytes.length += spare.length;</span><br><span class="line"> decompressed += toDecompress;</span><br><span class="line"> }</span><br><span class="line"> } else {</span><br><span class="line"> decompressor.decompress(fieldsStream, totalLength, 0, totalLength, bytes);</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>读取每个chunk具体的过程,和写chunk过程是一一对应的,要是感兴趣的话,可以看下<code>CompressingStoredFieldsWriter.flush</code>过程,这里就不再赘述。</p>
<h2 id="有删除或者脏chunk占比过大"><a href="#有删除或者脏chunk占比过大" class="headerlink" title="有删除或者脏chunk占比过大"></a>有删除或者脏chunk占比过大</h2><p>针对以下两种情况:对有文档删除的segment,我们需要在新的Segment中踢掉已经处于删除状态的文档;脏chunk占比过大,会影响压缩效率,我们也需要重建Segment, 基于此只能读取旧StoredFields中的每个文档,重新逐条写入新的的Segments中。具体操作如下:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br></pre></td><td class="code"><pre><span class="line">for (int docID = 0; docID < maxDoc; docID++) {</span><br><span class="line"> // 这个节点需要被删除,那么直接忽略</span><br><span class="line"> if (liveDocs != null && liveDocs.get(docID) == false) { </span><br><span class="line"> continue;</span><br><span class="line"> }</span><br><span class="line"> SerializedDocument doc = matchingFieldsReader.document(docID);</span><br><span class="line"> startDocument();</span><br><span class="line"> // 读取出来</span><br><span class="line"> bufferedDocs.copyBytes(doc.in, doc.length);</span><br><span class="line"> numStoredFieldsInDoc = doc.numStoredFields;</span><br><span class="line"> finishDocument();</span><br><span class="line"> ++docCount;</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>由于需要重新组织新的chunk结构,那么就需要每个live文档从旧StoredFields读取出来,写入新的chunk中。会轮循每个文档,每个文档都做了两个事情:<br>1.检查该文档所在的chunk是否已经加载到内存中,若没有加载到的话,那么调用<code>BlockState.reset(int docID)</code>加载chunk,然后放入内存中。<br>2.从内存中读取这个文档的value,写入即将创建的chunk中,此时新写入缓存的文档,将在最后的finish统一刷入文件形成一个新的chunk。</p>
<h1 id="总结"><a href="#总结" class="headerlink" title="总结"></a>总结</h1><p>StoredFields合并过程主要是循环每个segment,针对每个segment都从文档中读取出来,写入新的chunk中。读取每个segment过程分为两种;一种是批量copy整个chunk的值,写入新的chunk,然后再copy下个chunk,一种方式是读取该segment中每个文档,然后写入新的chunk,当然批量copy整个chunk的效率更高。选取哪种方式读取,取决于这个sement是否存在删除,或者存在dirtyChunk过多的情况。</p>
</div>
<div class="article-info article-info-index">
<div class="article-category tagcloud">
<i class="icon-book icon"></i>
<ul class="article-tag-list">
<li class="article-tag-list-item">
<a href="/elasticsearch_learning/categories/Lucene//" class="article-tag-list-link color2">Lucene</a>
</li>
</ul>
</div>
<p class="article-more-link">
<a class="article-more-a" href="/elasticsearch_learning/2021/06/13/Lucene8-6-2底层架构-Segment-StoredFields合并原理详解/">展开全文 >></a>
</p>
<div class="clearfix"></div>
</div>
</div>
</article>
<aside class="wrap-side-operation">
<div class="mod-side-operation">
<div class="jump-container" id="js-jump-container" style="display:none;">
<a href="javascript:void(0)" class="mod-side-operation__jump-to-top">
<i class="icon-font icon-back"></i>
</a>
<div id="js-jump-plan-container" class="jump-plan-container" style="top: -11px;">
<i class="icon-font icon-plane jump-plane"></i>
</div>
</div>
</div>
</aside>
<article id="post-Lucene8-6-2底层架构-Point查询过程" class="article article-type-post article-index" itemscope itemprop="blogPost">
<div class="article-inner">
<header class="article-header">
<h1 itemprop="name">
<a class="article-title" href="/elasticsearch_learning/2021/01/03/Lucene8-6-2底层架构-Point查询过程/">Lucene8.6.2底层架构-Point查询过程</a>
</h1>
<a href="/elasticsearch_learning/2021/01/03/Lucene8-6-2底层架构-Point查询过程/" class="archive-article-date">
<time datetime="2021-01-03T11:56:55.000Z" itemprop="datePublished"><i class="icon-calendar icon"></i>2021-01-03</time>
</a>
</header>
<div class="article-entry" itemprop="articleBody">
</div>
<div class="article-info article-info-index">
<div class="article-category tagcloud">
<i class="icon-book icon"></i>
<ul class="article-tag-list">
<li class="article-tag-list-item">
<a href="/elasticsearch_learning/categories/Lucene//" class="article-tag-list-link color2">Lucene</a>
</li>
</ul>
</div>
<p class="article-more-link">
<a class="article-more-a" href="/elasticsearch_learning/2021/01/03/Lucene8-6-2底层架构-Point查询过程/">展开全文 >></a>
</p>
<div class="clearfix"></div>
</div>
</div>
</article>
<aside class="wrap-side-operation">
<div class="mod-side-operation">
<div class="jump-container" id="js-jump-container" style="display:none;">
<a href="javascript:void(0)" class="mod-side-operation__jump-to-top">
<i class="icon-font icon-back"></i>
</a>
<div id="js-jump-plan-container" class="jump-plan-container" style="top: -11px;">
<i class="icon-font icon-plane jump-plane"></i>
</div>
</div>
</div>
</aside>
<article id="post-Lucene8-6-2底层架构-BKW树构建过程" class="article article-type-post article-index" itemscope itemprop="blogPost">
<div class="article-inner">
<header class="article-header">
<h1 itemprop="name">
<a class="article-title" href="/elasticsearch_learning/2020/11/01/Lucene8-6-2底层架构-BKW树构建过程/">Lucene8.6.2底层架构-BKW树构建过程</a>
</h1>
<a href="/elasticsearch_learning/2020/11/01/Lucene8-6-2底层架构-BKW树构建过程/" class="archive-article-date">
<time datetime="2020-11-01T08:22:41.000Z" itemprop="datePublished"><i class="icon-calendar icon"></i>2020-11-01</time>
</a>
</header>
<div class="article-entry" itemprop="articleBody">
<p>针对数值型的倒排索引,Lecene从6.X引入了BKD树结构,BKD全称:Block K-Dimension Balanced Tree。在此之前,数值型查找和String结构一样,使用<a href="https://kkewwei.github.io/elasticsearch_learning/2020/02/25/Lucene8-2-0%E5%BA%95%E5%B1%82%E6%9E%B6%E6%9E%84-%E8%AF%8D%E5%85%B8fst%E5%8E%9F%E7%90%86%E8%A7%A3%E6%9E%90/">FST结构</a>)建立索引,FST结构针对精确匹配存在较大的优势,但是数值型很大部分使用场景为范围查找, BKD树就是解决这类使用场景的。若我们将多维简化为一维时,结构就是bst(二叉查找树)。</p>
<h1 id="数据放入内存中"><a href="#数据放入内存中" class="headerlink" title="数据放入内存中"></a>数据放入内存中</h1><p>BKD树支持多维范围,数值型包括int,float,point等, 这里就以int类型写入作为示例,将<code>age</code>建构为三维:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line">Document document = new Document();</span><br><span class="line">document.add(new IntPoint("age", i, i*i, i%20));</span><br><span class="line">indexWriter.addDocument(document);</span><br></pre></td></tr></table></figure>
<p>IntPoint内部会将多维转变为一维数组,转变过程比较简单,比如int,将转变为长度为3*4=12的byte数组。真正开始在内存中建立索引结构是在<code>DefaultIndexingChain.indexPoint()</code>处:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br></pre></td><td class="code"><pre><span class="line">private void indexPoint(int docID, PerField fp, IndexableField field) {</span><br><span class="line"> fp.pointValuesWriter.addPackedValue(docID, field.binaryValue());</span><br><span class="line">}</span><br><span class="line">// Point有多个值的话,都会堆砌到一个BytesRef中</span><br><span class="line"> public void PointValuesWriter.addPackedValue(int docID, BytesRef value) { </span><br><span class="line"> bytes.append(value);</span><br><span class="line"> docIDs[numPoints] = docID; </span><br><span class="line"> if (docID != lastDocID) {</span><br><span class="line"> numDocs++;</span><br><span class="line"> lastDocID = docID;</span><br><span class="line"> }</span><br><span class="line"> numPoints++;</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>其中,使用ByteBlockPool弹性扩容的功能存储byte数组的value, docIDs记录的是第几个point对应的文档号。numPoints记录的是point的个数(一个point可以由多个域构成),因为存在一个字段,会存储多个point的情况。</p>
<h1 id="将内存中的结构flush到磁盘"><a href="#将内存中的结构flush到磁盘" class="headerlink" title="将内存中的结构flush到磁盘"></a>将内存中的结构flush到磁盘</h1><h2 id="构建kdd文件"><a href="#构建kdd文件" class="headerlink" title="构建kdd文件"></a>构建kdd文件</h2><p>flush到文件中指的是形成一个segment,触发条件有两个(同<a href="https://kkewwei.github.io/elasticsearch_learning/2019/10/29/Lucenec%E5%BA%95%E5%B1%82%E6%9E%B6%E6%9E%84-fdt-fdx%E6%9E%84%E5%BB%BA%E8%BF%87%E7%A8%8B/#%E5%88%B7%E5%88%B0fdx%E6%96%87%E4%BB%B6">fdx</a>,<a href="https://kkewwei.github.io/elasticsearch_learning/2019/11/15/Lucene%E5%BA%95%E5%B1%82%E6%9E%B6%E6%9E%84-dvm-dvm%E6%9E%84%E5%BB%BA%E8%BF%87%E7%A8%8B/#%E5%88%B7%E6%96%B0%E5%88%B0%E6%96%87%E4%BB%B6">dvm</a>、<a href="https://kkewwei.github.io/elasticsearch_learning/2020/02/28/Lucene8-2-0%E5%BA%95%E5%B1%82%E6%9E%B6%E6%9E%84-tim-tip%E8%AF%8D%E5%85%B8%E7%BB%93%E6%9E%84%E5%8E%9F%E7%90%86%E7%A0%94%E7%A9%B6/#flush%E5%88%B0%E6%96%87%E4%BB%B6%E4%B8%AD">词典建立</a>一样):<br>1.lucene建立的索引结构占用内存或者缓存文档数超过阈值。该check会在每次索引完一个文档后(详见<code>flushControl.doAfterDocument</code>)。<br>2.用户主动调用indexWriter.flush()触发。<br>刷新建立BKD树时,我们首先进入<code>PointValuesWriter.flush()</code>:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br></pre></td><td class="code"><pre><span class="line">public void flush(SegmentWriteState state, Sorter.DocMap sortMap, PointsWriter writer) throws IOException {</span><br><span class="line"> // 封装了读取point&文档的方法</span><br><span class="line"> PointValues points = new MutablePointValues() {</span><br><span class="line"> // 给每个point都编了号。若之后对该域每个point进行排序,也仅仅是对这个号排序。</span><br><span class="line"> final int[] ords = new int[numPoints]; </span><br><span class="line"> {</span><br><span class="line"> for (int i = 0; i < numPoints; ++i) {</span><br><span class="line"> ords[i] = i; </span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> // ords存放的是排序后的point编号,通过docIDs来查找真正的docId</span><br><span class="line"> @Override</span><br><span class="line"> public int getDocID(int i) { return docIDs[ords[i]]; }</span><br><span class="line"> // 读取第i个point的值</span><br><span class="line"> @Override</span><br><span class="line"> public void getValue(int i, BytesRef packedValue) {</span><br><span class="line"> // 这个数据的偏移量</span><br><span class="line"> final long offset = (long) packedBytesLength * ords[i]; </span><br><span class="line"> packedValue.length = packedBytesLength;</span><br><span class="line"> bytes.setRawBytesRef(packedValue, offset);</span><br><span class="line"> }</span><br><span class="line"> // 第i个point的第k位</span><br><span class="line"> @Override</span><br><span class="line"> public byte getByteAt(int i, int k) {</span><br><span class="line"> final long offset = (long) packedBytesLength * ords[i] + k;</span><br><span class="line"> // 从BytePool中读取offset</span><br><span class="line"> return bytes.readByte(offset);</span><br><span class="line"> }</span><br><span class="line"> };</span><br><span class="line"> final PointValues values = points; </span><br><span class="line"> writer.writeField(fieldInfo, reader); </span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>这里用MutablePointValues封装了读取每个point的方法,需要进入Lucene86PointsWriter.writeField看下。我们需要知道,Lucene86PointsWriter定义了BKD每个叶子存放的point不能超过512个(BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE),大小不能超过16MB(BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP):</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br></pre></td><td class="code"><pre><span class="line">@Override</span><br><span class="line">public void writeField(FieldInfo fieldInfo, PointsReader reader) throws IOException {</span><br><span class="line"> PointValues values = reader.getValues(fieldInfo.name);</span><br><span class="line"> try (BKDWriter writer = new BKDWriter(writeState.segmentInfo.maxDoc(),</span><br><span class="line"> writeState.directory,</span><br><span class="line"> writeState.segmentInfo.name,</span><br><span class="line"> fieldInfo.getPointDimensionCount(),</span><br><span class="line"> fieldInfo.getPointIndexDimensionCount(),</span><br><span class="line"> fieldInfo.getPointNumBytes(),</span><br><span class="line"> maxPointsInLeafNode,</span><br><span class="line"> maxMBSortInHeap,</span><br><span class="line"> values.size())) {</span><br><span class="line"></span><br><span class="line"> if (values instanceof MutablePointValues) {</span><br><span class="line"> Runnable finalizer = writer.writeField(metaOut, indexOut, dataOut, fieldInfo.name, (MutablePointValues) values);</span><br><span class="line"> if (finalizer != null) {</span><br><span class="line"> metaOut.writeInt(fieldInfo.number);</span><br><span class="line"> finalizer.run();</span><br><span class="line"> }</span><br><span class="line"> return;</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>BKDWriter函数就是构建BKD数的核心类, 需要继续进入BKDWriter.writeField->writeFieldNDims看如何构建的,我们以point为2+维进行介绍,一维的是其简化版:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br></pre></td><td class="code"><pre><span class="line"> private Runnable writeFieldNDims(IndexOutput metaOut, IndexOutput indexOut, IndexOutput dataOut, String fieldName, MutablePointValues values) throws IOException {</span><br><span class="line"> finished = true;</span><br><span class="line"></span><br><span class="line"> pointCount = values.size();</span><br><span class="line"> // 统计多少个叶子节点,一个叶子节点存放512个point</span><br><span class="line"> final int numLeaves = Math.toIntExact((pointCount + maxPointsInLeafNode - 1) / maxPointsInLeafNode);</span><br><span class="line"> final int numSplits = numLeaves - 1;</span><br><span class="line"></span><br><span class="line"> // 第1位放置当前级的Dim,第2部分防止分割时候的当前值。记录了每个NodeId都是从哪离开始切割的。完全二叉树,1,2,3</span><br><span class="line"> final byte[] splitPackedValues = new byte[numSplits * bytesPerDim];</span><br><span class="line"> final byte[] splitDimensionValues = new byte[numSplits];</span><br><span class="line"> // 每个叶子在kdd中开始存放的位置</span><br><span class="line"> final long[] leafBlockFPs = new long[numLeaves]; </span><br><span class="line"> // 获取每个维度的最大值与最小值</span><br><span class="line"> // compute the min/max for this slice</span><br><span class="line"> computePackedValueBounds(values, 0, Math.toIntExact(pointCount), minPackedValue, maxPackedValue, scratchBytesRef1);</span><br><span class="line"> for (int i = 0; i < Math.toIntExact(pointCount); ++i) {</span><br><span class="line"> docsSeen.set(values.getDocID(i));</span><br><span class="line"> }</span><br><span class="line"> // 开始构造BKD树</span><br><span class="line"> final long dataStartFP = dataOut.getFilePointer();</span><br><span 
class="line"> // 将统计每个维度拆分的次数,若存在某个维度切分次数不足最大的一半,那么本次将选择这个维度切分,以便尽量避免每个维度拆分次数差距过大,而导致查询毛刺</span><br><span class="line"> final int[] parentSplits = new int[numIndexDims]; </span><br><span class="line"> build(0, numLeaves, values, 0, Math.toIntExact(pointCount), dataOut,</span><br><span class="line"> minPackedValue.clone(), maxPackedValue.clone(), parentSplits,</span><br><span class="line"> splitPackedValues, splitDimensionValues, leafBlockFPs,</span><br><span class="line"> new int[maxPointsInLeafNode]);</span><br><span class="line"></span><br><span class="line"> scratchBytesRef1.length = bytesPerDim;</span><br><span class="line"> scratchBytesRef1.bytes = splitPackedValues;</span><br><span class="line"> BKDTreeLeafNodes leafNodes = new BKDTreeLeafNodes() {</span><br><span class="line"> @Override</span><br><span class="line"> public long getLeafLP(int index) {</span><br><span class="line"> return leafBlockFPs[index];</span><br><span class="line"> }</span><br><span class="line"></span><br><span class="line"> @Override</span><br><span class="line"> public BytesRef getSplitValue(int index) {</span><br><span class="line"> scratchBytesRef1.offset = index * bytesPerDim;</span><br><span class="line"> return scratchBytesRef1;</span><br><span class="line"> }</span><br><span class="line"> @Override</span><br><span class="line"> public int getSplitDimension(int index) {</span><br><span class="line"> return splitDimensionValues[index] & 0xff;</span><br><span class="line"> }</span><br><span class="line"> @Override</span><br><span class="line"> public int numLeaves() {</span><br><span class="line"> return leafBlockFPs.length;</span><br><span class="line"> }</span><br><span class="line"> };</span><br><span class="line"> return () -> {</span><br><span class="line"> try { // metaOut:kdm文件 indexOut:kdi文件</span><br><span class="line"> writeIndex(metaOut, indexOut, maxPointsInLeafNode, leafNodes, dataStartFP); // 写dim文件</span><br><span class="line"> } catch (IOException e) {</span><br><span class="line"> throw new UncheckedIOException(e);</span><br><span class="line"> }</span><br><span class="line"> };</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>该函数主要做了如下事情:<br>1.计算了该BDK的叶子数<br>2.首先统计每个维度的最大值&最小值, 以便决定是从哪个维度开始切分排序<br>3.进入<code>BKDWriter.build()</code>函数开始递归构建每个维度。原则上,对于当前所有point,首先按照512个point为1个节点,放入完全二叉树的叶子中。按照从跟节点从上向下切分所有的叶子节点,切分前查找每个维度的最大最小差值,以这个维度将切分的左右子树保持局部有序。<br>4.将构建好的BKD树元数据存放在kdi和kdm文件中.</p>
<p><code>BKDWriter.build</code>将切分过程&局部有序分成两个阶段:<br>1.切分到叶子节点后也保证叶子内某个维度有序。<br>2.当没切分到叶子节点时,保证左右子树局部有序。</p>
<h3 id="当不是叶子节点时,需要split,保证某一维度有序"><a href="#当不是叶子节点时,需要split,保证某一维度有序" class="headerlink" title="当不是叶子节点时,需要split,保证某一维度有序"></a>当不是叶子节点时,需要split,保证某一维度有序</h3><p>这里看下如何还不是叶子节点时的切分过程:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br></pre></td><td class="code"><pre><span class="line"> final int splitDim;</span><br><span class="line"> // compute the split dimension and partition around it</span><br><span class="line"> // 只有一个维度</span><br><span class="line"> if (numIndexDims == 1) { </span><br><span class="line"> splitDim = 0;</span><br><span class="line"> // 至少两个维度的数据</span><br><span class="line"> } else { </span><br><span class="line"> // for dimensions > 2 we recompute the bounds for the current inner node to help the algorithm choose best</span><br><span class="line"> // split dimensions. 
Because it is an expensive operation, the frequency we recompute the bounds is given</span><br><span class="line"> // by SPLITS_BEFORE_EXACT_BOUNDS.</span><br><span class="line"> // 大于2个维度的话,为了最合适的拆分,需要重新找下最大值和最小值</span><br><span class="line"> if (numLeaves != leafBlockFPs.length && numIndexDims > 2 && Arrays.stream(parentSplits).sum() % SPLITS_BEFORE_EXACT_BOUNDS == 0) {</span><br><span class="line"> computePackedValueBounds(reader, from, to, minPackedValue, maxPackedValue, scratchBytesRef1);</span><br><span class="line"> }</span><br><span class="line"> // 查找每个维度最大值和最小值差值的最大的那个维度,决定以这个维度开始拆分</span><br><span class="line"> splitDim = split(minPackedValue, maxPackedValue, parentSplits);</span><br><span class="line"> }</span><br><span class="line"> // 左子树叶子节点</span><br><span class="line"> // How many leaves will be in the left tree:</span><br><span class="line"> int numLeftLeafNodes = getNumLeftLeafNodes(numLeaves);</span><br><span class="line"> // How many points will be in the left tree:</span><br><span class="line"> final int mid = from + numLeftLeafNodes * maxPointsInLeafNode; // 那么重点节点的index编号</span><br><span class="line"> // 确定最大值和最小值相同的前缀长度</span><br><span class="line"> int commonPrefixLen = FutureArrays.mismatch(minPackedValue, splitDim * bytesPerDim,</span><br><span class="line"> splitDim * bytesPerDim + bytesPerDim, maxPackedValue, splitDim * bytesPerDim,</span><br><span class="line"> splitDim * bytesPerDim + bytesPerDim);</span><br><span class="line"> if (commonPrefixLen == -1) {</span><br><span class="line"> commonPrefixLen = bytesPerDim;</span><br><span class="line"> }</span><br><span class="line"> // 通过基数排序+快排实现了排序,保证中间数左右有序</span><br><span class="line"> MutablePointsReaderUtils.partition(numDataDims, numIndexDims, maxDoc, splitDim, bytesPerDim, commonPrefixLen,</span><br><span class="line"> reader, from, to, mid, scratchBytesRef1, scratchBytesRef2);</span><br><span class="line"></span><br><span class="line"> final int rightOffset = leavesOffset + numLeftLeafNodes;</span><br><span class="line"> // 拆分时那个节点的偏移量</span><br><span class="line"> final int splitOffset = rightOffset - 1;</span><br><span class="line"> // set the split value</span><br><span class="line"> final int address = splitOffset * bytesPerDim;</span><br><span class="line"> // 以哪个节点哪个维度开始切分</span><br><span class="line"> splitDimensionValues[splitOffset] = (byte) splitDim;</span><br><span class="line"> </span><br><span class="line"> reader.getValue(mid, scratchBytesRef1);</span><br><span class="line"> System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, splitPackedValues, address, bytesPerDim);</span><br><span class="line"></span><br><span class="line"> byte[] minSplitPackedValue = ArrayUtil.copyOfSubArray(minPackedValue, 0, packedIndexBytesLength); //从minPackedValue中copy一份最小值</span><br><span class="line"> byte[] maxSplitPackedValue = ArrayUtil.copyOfSubArray(maxPackedValue, 0, packedIndexBytesLength); //从maxPackedValue中copy一份最大值</span><br><span class="line"> //重新定义左子树的最小值</span><br><span class="line"> System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim,</span><br><span class="line"> minSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);</span><br><span class="line"> System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim,</span><br><span class="line"> maxSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);</span><br><span class="line"></span><br><span class="line"> // recurse</span><br><span 
class="line"> // 统计哪个维度被切分了</span><br><span class="line"> parentSplits[splitDim]++; </span><br><span class="line"> // 左中</span><br><span class="line"> build(leavesOffset, numLeftLeafNodes, reader, from, mid, out,</span><br><span class="line"> minPackedValue, maxSplitPackedValue, parentSplits,</span><br><span class="line"> splitPackedValues, splitDimensionValues, leafBlockFPs, spareDocIds);</span><br><span class="line"> // 中又</span><br><span class="line"> build(rightOffset, numLeaves - numLeftLeafNodes, reader, mid, to, out,</span><br><span class="line"> minSplitPackedValue, maxPackedValue, parentSplits,</span><br><span class="line"> splitPackedValues, splitDimensionValues, leafBlockFPs, spareDocIds);</span><br><span class="line"> parentSplits[splitDim]--;</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>根据当前数据进行排序切分,主要做了如下事情:<br>1.判断是否需要重新(精确)统计from->to所有维度最大值、最小值,仅当父类节点在当前维度每切分4次(SPLITS_BEFORE_EXACT_BOUNDS)才统计,因为精确统计所有point的边界是一个非常昂贵的操作。<br>2.在<code>BKDWriter.split()</code>根据最大值最小值边界差距最大的那个维度确定以哪个维度排序。同时为了考虑每个维度被选中切分排序的次数不能差距太大,规定了,每个维度切分排序次数不能相差2倍,若切分次数太少了,会强制选择切分次数最小的那个维度。<br>3.通过构建完全二叉树来计算左子树、右子树分别有多少个叶子节点。<br>4.确定在被切分的维度上,最大值和最小值有多少个相同的前缀,以便压缩存储。<br>5.使用<code>MutablePointsReaderUtils.partition</code>来进行排序。保证当前维度下,左子树的值小于右子树。<br>6.重新恢复左子树的所有维度的最大值:<code>maxSplitPackedValue</code>和右子树的所有维度的最小值:<code>minSplitPackedValue</code><br>7.分别递归左子树和右子树,再继续查找分隔维度并在此维度上进行排序。最终形成了每个子树只在一个维度保证了左右有序。切分过程如下:<br><img src="https://kkewwei.github.io/elasticsearch_learning/img/bkd1.png" height="350" width="350"></p>
<p>我们再看下<code>MutablePointsReaderUtils.partition</code>如何在splitDim维度上保证左边的数值小于等于mid对应的值、右边的数大于等于mid对应的值。这里其实使用了堆排&快排的思想完成的排序。</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br><span class="line">68</span><br><span class="line">69</span><br><span class="line">70</span><br><span class="line">71</span><br><span class="line">72</span><br><span class="line">73</span><br><span class="line">74</span><br><span class="line">75</span><br><span class="line">76</span><br><span class="line">77</span><br><span class="line">78</span><br><span class="line">79</span><br></pre></td><td class="code"><pre><span class="line">public static void partition(int numDataDim, int numIndexDim, int maxDoc, int splitDim, int bytesPerDim, int commonPrefixLen,</span><br><span class="line"> MutablePointValues reader, int from, int to, int mid,</span><br><span class="line"> BytesRef scratch1, BytesRef scratch2) {</span><br><span class="line"> // 这个point内的偏移量:该维度</span><br><span class="line"> final int dimOffset = splitDim * bytesPerDim + commonPrefixLen; </span><br><span class="line"> // 需要比较的位数</span><br><span class="line"> final int dimCmpBytes = bytesPerDim - commonPrefixLen; </span><br><span class="line"> // 整个point的数据结尾位置</span><br><span class="line"> final int dataOffset = numIndexDim * bytesPerDim; </span><br><span class="line"> // 该point该point不相同的个数</span><br><span class="line"> final int dataCmpBytes = (numDataDim - numIndexDim) * bytesPerDim + dimCmpBytes; </span><br><span class="line"> final int bitsPerDocId = 
PackedInts.bitsRequired(maxDoc - 1);</span><br><span class="line"> // 这里位数为两类,可以从byteAt()看出,读取每一类的方式也不一样</span><br><span class="line"> new RadixSelector(dataCmpBytes + (bitsPerDocId + 7) / 8) {</span><br><span class="line"> // 第一类就是普通的dimCmpBytes,读取的是不相同的字符;第二类是 (bitsPerDocId + 7) / 8, 读取的是文档Id,就是说把文档Id作为了桶的一部分</span><br><span class="line"> @Override</span><br><span class="line"> // 使用快排进行排序</span><br><span class="line"> protected Selector getFallbackSelector(int k) { </span><br><span class="line"> final int dataStart = (k < dimCmpBytes) ? dataOffset : dataOffset + k - dimCmpBytes;</span><br><span class="line"> final int dataEnd = numDataDim * bytesPerDim;</span><br><span class="line"> return new IntroSelector() {</span><br><span class="line"></span><br><span class="line"> final BytesRef pivot = scratch1;</span><br><span class="line"> int pivotDoc;</span><br><span class="line"></span><br><span class="line"> @Override</span><br><span class="line"> protected void swap(int i, int j) {</span><br><span class="line"> reader.swap(i, j);</span><br><span class="line"> }</span><br><span class="line"> @Override</span><br><span class="line"> protected void setPivot(int i) {</span><br><span class="line"> reader.getValue(i, pivot);</span><br><span class="line"> pivotDoc = reader.getDocID(i);</span><br><span class="line"> }</span><br><span class="line"> @Override</span><br><span class="line"> protected int comparePivot(int j) {</span><br><span class="line"> if (k < dimCmpBytes) {</span><br><span class="line"> reader.getValue(j, scratch2);</span><br><span class="line"> int cmp = FutureArrays.compareUnsigned(pivot.bytes, pivot.offset + dimOffset + k, pivot.offset + dimOffset + dimCmpBytes,</span><br><span class="line"> scratch2.bytes, scratch2.offset + dimOffset + k, scratch2.offset + dimOffset + dimCmpBytes);</span><br><span class="line"> if (cmp != 0) {</span><br><span class="line"> return cmp;</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> if (k < dataCmpBytes) {</span><br><span class="line"> reader.getValue(j, scratch2);</span><br><span class="line"> int cmp = FutureArrays.compareUnsigned(pivot.bytes, pivot.offset + dataStart, pivot.offset + dataEnd,</span><br><span class="line"> scratch2.bytes, scratch2.offset + dataStart, scratch2.offset + dataEnd);</span><br><span class="line"> if (cmp != 0) {</span><br><span class="line"> return cmp;</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> // 通过文档大小相比较</span><br><span class="line"> return pivotDoc - reader.getDocID(j); </span><br><span class="line"> }</span><br><span class="line"> };</span><br><span class="line"> }</span><br><span class="line"> @Override</span><br><span class="line"> protected void swap(int i, int j) {</span><br><span class="line"> reader.swap(i, j);</span><br><span class="line"> }</span><br><span class="line"> // 可以从maxLength=dataCmpBytes + (bitsPerDocId + 7) / 8可以看出,属于不同的读法,</span><br><span class="line"> @Override</span><br><span class="line"> // 第i个point,第k个byte</span><br><span class="line"> protected int byteAt(int i, int k) {</span><br><span class="line"> // 读取的是dataCmpBytes中的数据</span><br><span class="line"> if (k < dimCmpBytes) { </span><br><span class="line"> return Byte.toUnsignedInt(reader.getByteAt(i, dimOffset + k));</span><br><span class="line"> } else if (k < dataCmpBytes) {</span><br><span class="line"> return Byte.toUnsignedInt(reader.getByteAt(i, dataOffset + k - dimCmpBytes));</span><br><span class="line"> } else {</span><br><span 
class="line"> // 读取的是docId的高位,通过(k - dataCmpBytes)去掉原本影响</span><br><span class="line"> final int shift = bitsPerDocId - ((k - dataCmpBytes + 1) << 3);</span><br><span class="line"> // 应该仅仅是为了hash,从docId中取值 </span><br><span class="line"> return (reader.getDocID(i) >>> Math.max(0, shift)) & 0xff; </span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> }.select(from, to, mid);</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>Two things are worth noting about this sorting logic:<br>1. Each point's logical key for the current split dimension splitDim is dataCmpBytes + (bitsPerDocId + 7) / 8 bytes long. A radix sort needs a fixed key length per element so it can work from the most significant byte downwards; here a virtual key is built for every point's splitDim dimension. Over the points in [from, to), the selector first orders by the value bytes that can differ, then breaks ties by doc ID.<br>2. byteAt defines how the k-th byte of that logical key is read: while k is below dimCmpBytes it returns an actual byte of the split dimension; between dimCmpBytes and dataCmpBytes it returns a byte of the data dimensions; beyond dataCmpBytes it returns a byte of the doc ID, so equal values end up ordered by doc ID. A small sketch of the doc-ID part follows.</p>
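<p>To make the doc-ID tail of the radix key concrete, here is a minimal, self-contained sketch of the byte extraction (plain Java; <code>bitsRequired</code> stands in for PackedInts.bitsRequired, and all names are mine, not Lucene's):</p>
<figure class="highlight plain"><pre><code>
// Sketch: treat a doc ID as extra high-order-first key bytes,
// mirroring the shift logic in byteAt() above. Illustrative names only.
public class DocIdKeyBytes {
  // same idea as PackedInts.bitsRequired(maxDoc - 1): bits needed for the largest doc ID
  static int bitsRequired(long maxValue) {
    return Math.max(1, 64 - Long.numberOfLeadingZeros(maxValue));
  }

  // k-th doc-ID byte of the logical key (k counts from 0 within the doc-ID part)
  static int docIdByteAt(int docId, int bitsPerDocId, int k) {
    final int shift = bitsPerDocId - ((k + 1) << 3);   // high bytes come first
    return (docId >>> Math.max(0, shift)) & 0xff;
  }

  public static void main(String[] args) {
    int maxDoc = 1_000_000;                 // needs 20 bits -> 3 key bytes
    int bits = bitsRequired(maxDoc - 1);
    int keyBytes = (bits + 7) / 8;
    int docId = 0xABCDE;
    for (int k = 0; k < keyBytes; k++) {
      System.out.printf("byte %d = 0x%02X%n", k, docIdByteAt(docId, bits, k)); // AB, CD, DE
    }
  }
}
</code></pre></figure>
<p>With the key defined, we can look at the core selection routine inside RadixSelector:</p>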
<figure class="highlight plain"><pre><code>
private void radixSelect(int from, int to, int k, int d, int l) {
  // histogram counts how many entries have each byte value at position d
  final int[] histogram = this.histogram;
  // must be cleared before every pass
  Arrays.fill(histogram, 0);

  final int commonPrefixLength = computeCommonPrefixLengthAndBuildHistogram(from, to, d, histogram);
  // all points share a common prefix of logical key bytes starting at d
  if (commonPrefixLength > 0) {
    // if there are no more chars to compare or if all entries fell into the
    // first bucket (which means strings are shorter than d) then we are done
    // otherwise recurse
    if (d + commonPrefixLength < maxLength
        && histogram[0] < to - from) {
      // skip past the shared prefix and keep selecting
      radixSelect(from, to, k, d + commonPrefixLength, l);
    }
    return;
  }
  assert assertHistogram(commonPrefixLength, histogram);

  // no shared prefix at byte d: walk the buckets
  int bucketFrom = from;
  for (int bucket = 0; bucket < HISTOGRAM_SIZE; ++bucket) {
    // range of entries that fall into this bucket
    final int bucketTo = bucketFrom + histogram[bucket];
    // the bucket that contains index k is the only one that needs refining
    if (bucketTo > k) {
      // quicksort-style partition: everything below the bucket goes left,
      // everything above it goes right
      partition(from, to, bucket, bucketFrom, bucketTo, d);
      // recurse into the middle bucket on the next byte
      if (bucket != 0 && d + 1 < maxLength) {
        // all elements in bucket 0 are equal so we only need to recurse if bucket != 0
        select(bucketFrom, bucketTo, k, d + 1, l + 1);
      }
      return;
    }
    bucketFrom = bucketTo;
  }
  throw new AssertionError("Unreachable code");
}
</code></pre></figure>
<p>This function does the following:<br>1. computeCommonPrefixLengthAndBuildHistogram computes commonPrefixLength, the number of key bytes (starting at position d of the splitDim logical key) shared by all points, and fills histogram with the count of each byte value.<br><img src="https://kkewwei.github.io/elasticsearch_learning/img/bkd2.png" height="300" width="570"><br>2. If commonPrefixLength is greater than 0, all points agree on those bytes; if there are still unsorted key bytes left, the routine skips straight past the shared prefix and recurses.<br>3. Otherwise the points disagree at byte d, so it finds which histogram bucket the k-th entry (k is initialized to mid) falls into and calls <code>partition()</code>, quicksort-style, to move that bucket's entries into the middle: everything to the left is smaller, everything to the right is larger.<br><img src="https://kkewwei.github.io/elasticsearch_learning/img/bkd3.png" height="350" width="450"><br>4. It then recurses on byte d+1 within the bucket containing k, until that run of points is fully ordered on splitDim.<br>At this point all points are partially ordered on dimension splitDim: with k as the separator, every point left of the k-th point is less than or equal to it, and every point to its right is greater than or equal to it. A toy version of a single histogram pass follows.</p>
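<p>As a toy illustration of one such pass, the sketch below (not Lucene code; the byte keys and names are invented) builds a 256-way histogram over byte d and locates the bucket that contains the k-th smallest key:</p>
<figure class="highlight plain"><pre><code>
import java.util.Arrays;

// Toy single pass of MSD radix selection over byte[] keys (illustrative only).
public class RadixPass {
  static final int HISTOGRAM_SIZE = 256;

  // Returns {bucketFrom, bucketTo}: the index range of the bucket containing rank k
  // when entries are grouped by their byte at position d.
  static int[] bucketOf(byte[][] keys, int from, int to, int k, int d) {
    int[] histogram = new int[HISTOGRAM_SIZE];
    for (int i = from; i < to; i++) {
      histogram[keys[i][d] & 0xff]++;
    }
    int bucketFrom = from;
    for (int bucket = 0; bucket < HISTOGRAM_SIZE; bucket++) {
      int bucketTo = bucketFrom + histogram[bucket];
      if (bucketTo > k) {          // rank k lands in this bucket
        return new int[]{bucketFrom, bucketTo};
      }
      bucketFrom = bucketTo;
    }
    throw new AssertionError("k out of range");
  }

  public static void main(String[] args) {
    byte[][] keys = {{3, 9}, {1, 5}, {3, 1}, {2, 7}, {1, 0}};
    System.out.println(Arrays.toString(bucketOf(keys, 0, keys.length, 2, 0)));
    // byte 0 values: two 1s, one 2, two 3s -> rank 2 falls in bucket '2' -> [2, 3]
  }
}
</code></pre></figure>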
<h3 id="当递归到叶子节点时,需要split"><a href="#当递归到叶子节点时,需要split" class="headerlink" title="当递归到叶子节点时,需要split"></a>当递归到叶子节点时,需要split</h3><p>此时叶子节点内的point并没有按照某一个维度有序。每个叶子的处理顺序比较有序,是从第一个叶子、第二个、第三个、、、、最后一个叶子的顺序进行的。这部分主要是将叶子节所有point点如何高效的存储起来。</p>
<figure class="highlight plain"><pre><code>
// number of points in this leaf
final int count = to - from;
// Compute common prefixes: for every dimension, the prefix shared by all points in the leaf
Arrays.fill(commonPrefixLengths, bytesPerDim);
// read all dimensions of the first point
reader.getValue(from, scratchBytesRef1);
// compare every following point against it
for (int i = from + 1; i < to; ++i) {
  reader.getValue(i, scratchBytesRef2);
  for (int dim = 0; dim < numDataDims; dim++) {
    final int offset = dim * bytesPerDim;
    int dimensionPrefixLength = commonPrefixLengths[dim];
    commonPrefixLengths[dim] = FutureArrays.mismatch(scratchBytesRef1.bytes, scratchBytesRef1.offset + offset,
        scratchBytesRef1.offset + offset + dimensionPrefixLength,
        scratchBytesRef2.bytes, scratchBytesRef2.offset + offset,
        scratchBytesRef2.offset + offset + dimensionPrefixLength);
    if (commonPrefixLengths[dim] == -1) {
      // the two values are identical over the compared range: keep the current prefix length
      commonPrefixLengths[dim] = dimensionPrefixLength;
    }
  }
}

// Find the dimension that has the least number of unique bytes at commonPrefixLengths[dim]
FixedBitSet[] usedBytes = new FixedBitSet[numDataDims];
for (int dim = 0; dim < numDataDims; ++dim) {
  if (commonPrefixLengths[dim] < bytesPerDim) {
    // one bit per possible byte value (256); only dimensions whose values differ get a set
    usedBytes[dim] = new FixedBitSet(256);
  }
}
// count the distinct byte values right after the common prefix of each dimension
for (int i = from + 1; i < to; ++i) {
  for (int dim = 0; dim < numDataDims; dim++) {
    if (usedBytes[dim] != null) { // this dimension's values differ
      byte b = reader.getByteAt(i, dim * bytesPerDim + commonPrefixLengths[dim]);
      usedBytes[dim].set(Byte.toUnsignedInt(b));
    }
  }
}
// pick the dimension with the smallest distinct-byte cardinality
int sortedDim = 0;
int sortedDimCardinality = Integer.MAX_VALUE;
for (int dim = 0; dim < numDataDims; ++dim) {
  if (usedBytes[dim] != null) {
    final int cardinality = usedBytes[dim].cardinality();
    if (cardinality < sortedDimCardinality) {
      sortedDim = dim;
      sortedDimCardinality = cardinality;
    }
  }
}
// sort the leaf by that most-concentrated dimension
// sort by sortedDim
MutablePointsReaderUtils.sortByDim(numDataDims, numIndexDims, sortedDim, bytesPerDim, commonPrefixLengths,
    reader, from, to, scratchBytesRef1, scratchBytesRef2);

BytesRef comparator = scratchBytesRef1;
BytesRef collector = scratchBytesRef2;
// read the first sorted point; it is the value we compare against
reader.getValue(from, comparator);
// leafCardinality counts the runs of distinct full points (all dimensions compared)
int leafCardinality = 1;
for (int i = from + 1; i < to; ++i) {
  // read the next point into `collector`
  reader.getValue(i, collector);
  // one differing dimension is enough to make this a new distinct point
  for (int dim = 0; dim < numDataDims; dim++) {
    // compare from the first byte that can differ
    final int start = dim * bytesPerDim + commonPrefixLengths[dim];
    final int end = dim * bytesPerDim + bytesPerDim;
    if (FutureArrays.mismatch(collector.bytes, collector.offset + start, collector.offset + end,
        comparator.bytes, comparator.offset + start, comparator.offset + end) != -1) {
      leafCardinality++;
      // swap collector and comparator so that each point is compared to its predecessor
      BytesRef scratch = collector;
      collector = comparator;
      comparator = scratch;
      break;
    }
  }
}
// Save the block file pointer:
leafBlockFPs[leavesOffset] = out.getFilePointer(); // position in the kdd file
// Write doc IDs
int[] docIDs = spareDocIds;
for (int i = from; i < to; ++i) {
  // collect the doc IDs of points from..to
  docIDs[i - from] = reader.getDocID(i);
}
// persist the doc IDs
writeLeafBlockDocs(scratchOut, docIDs, 0, count);
// Write the common prefixes:
// copy the first point
reader.getValue(from, scratchBytesRef1);
System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, packedBytesLength);
writeCommonPrefixes(scratchOut, commonPrefixLengths, scratch1);
// Write the full values:
IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
  @Override
  public BytesRef apply(int i) {
    reader.getValue(from + i, scratchBytesRef1);
    return scratchBytesRef1;
  }
};
// then write the remaining leaf data
writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, count, sortedDim, packedValues, leafCardinality);
// flush the scratch buffer into the kdd file
out.writeBytes(scratchOut.getBytes(), 0, scratchOut.getPosition());
scratchOut.reset();
</code></pre></figure>
<p>Concretely, it does the following:<br>1. Walks every point in [from, to) and, per dimension, computes the prefix length shared by all points, stored in commonPrefixLengths (see the sketch after this list).<br>2. For each dimension, counts the cardinality of the first byte past the common prefix, using a 256-bit FixedBitSet, one bit per possible byte value.<br>3. Picks sortedDim, the dimension with the smallest such cardinality.<br>4. Calls <code>MutablePointsReaderUtils.sortByDim</code> to order all points in the leaf by sortedDim, again with a quicksort-style selector.<br>5. Counts leafCardinality over the points in [from, to); here every dimension is compared.<br>6. Writes the doc IDs of the sorted points.<br>7. Writes the per-dimension common prefixes.<br>8. Writes the actual point values via <code>BKDWriter.writeLeafBlockPackedValues()</code>.<br>The resulting kdd file layout:<br><img src="https://kkewwei.github.io/elasticsearch_learning/img/bkd5.png" height="100" width="900"></p>
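<p>The per-dimension common-prefix computation in step 1 boils down to repeated mismatch scans; a minimal standalone version (using java.util.Arrays.mismatch instead of Lucene's FutureArrays; names are invented) looks like this:</p>
<figure class="highlight plain"><pre><code>
import java.util.Arrays;

// Toy version of step 1: shrink each dimension's common prefix by scanning all points.
public class CommonPrefixes {
  // points[i] is a packed value: numDims contiguous fields of bytesPerDim bytes each
  static int[] commonPrefixLengths(byte[][] points, int numDims, int bytesPerDim) {
    int[] prefixes = new int[numDims];
    Arrays.fill(prefixes, bytesPerDim);
    for (int i = 1; i < points.length; i++) {
      for (int dim = 0; dim < numDims; dim++) {
        int off = dim * bytesPerDim;
        // first index at which the two ranges differ, or -1 if they are equal
        int mismatch = Arrays.mismatch(points[0], off, off + prefixes[dim],
                                       points[i], off, off + prefixes[dim]);
        if (mismatch != -1) {
          prefixes[dim] = mismatch;   // the prefix can only shrink
        }
      }
    }
    return prefixes;
  }

  public static void main(String[] args) {
    byte[][] points = {
        {1, 2, 3, 7, 7, 0},
        {1, 2, 4, 7, 7, 1},
        {1, 2, 3, 7, 7, 2},
    };
    // two dims, 3 bytes each -> dim0 shares [1,2], dim1 shares [7,7]
    System.out.println(Arrays.toString(commonPrefixLengths(points, 2, 3)));  // [2, 2]
  }
}
</code></pre></figure>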
<h2 id="构建kdm和kdi文件"><a href="#构建kdm和kdi文件" class="headerlink" title="构建kdm和kdi文件"></a>构建kdm和kdi文件</h2><p>当对所有point进行排序后,开始存储BKD树的每个子节点和叶子节点,会进入到:</p>
<figure class="highlight plain"><pre><code>
private void writeIndex(IndexOutput metaOut, IndexOutput indexOut, int countPerLeaf, BKDTreeLeafNodes leafNodes, long dataStartFP) throws IOException {
  byte[] packedIndex = packIndex(leafNodes);
  writeIndex(metaOut, indexOut, countPerLeaf, leafNodes.numLeaves(), packedIndex, dataStartFP);
}
</code></pre></figure>
<p>This function does two things:<br>1. <code>packIndex</code> packs the BKD tree's inner nodes and leaf pointers into a compressed byte array.<br>2. <code>writeIndex</code> persists that packed data together with the BKD metadata.</p>
<h3 id="压缩转存BKD树"><a href="#压缩转存BKD树" class="headerlink" title="压缩转存BKD树"></a>压缩转存BKD树</h3><p>压缩BKD转存的核心函数是<code>recursePackIndex</code>,采用递归的方式转存,以中序遍历的方式对BKD树进行处理,首先先存储中间飞叶子的信息,然后再分别对左右叶子节点进行处理:</p>
<figure class="highlight plain"><pre><code>
// reached a leaf
if (numLeaves == 1) {
  if (isLeft) {
    return 0;
  } else {
    long delta = leafNodes.getLeafLP(leavesOffset) - minBlockFP;
    writeBuffer.writeVLong(delta);
    return appendBlock(writeBuffer, blocks);
  }
// inner node
} else {
  long leftBlockFP;
  if (isLeft) {
    // for a left subtree, leftBlockFP equals the parent's minBlockFP
    leftBlockFP = minBlockFP;
  } else {
    // for a right subtree, the smallest block FP is that of its first leaf,
    // i.e. the leaf at leavesOffset
    leftBlockFP = leafNodes.getLeafLP(leavesOffset);
    long delta = leftBlockFP - minBlockFP;
    assert leafNodes.numLeaves() == numLeaves || delta > 0 : "expected delta > 0; got numLeaves =" + numLeaves + " and delta=" + delta;
    writeBuffer.writeVLong(delta);
  }

  int numLeftLeafNodes = getNumLeftLeafNodes(numLeaves);
  final int rightOffset = leavesOffset + numLeftLeafNodes;
  final int splitOffset = rightOffset - 1;
  // same as during construction: which dimension this node splits on,
  // and the split value itself (address points at its first byte)
  int splitDim = leafNodes.getSplitDimension(splitOffset);
  BytesRef splitValue = leafNodes.getSplitValue(splitOffset);
  int address = splitValue.offset;

  // find common prefix with last split value in this dim:
  int prefix = FutureArrays.mismatch(splitValue.bytes, address, address + bytesPerDim, lastSplitValues,
      splitDim * bytesPerDim, splitDim * bytesPerDim + bytesPerDim);
  if (prefix == -1) {
    prefix = bytesPerDim;
  }

  int firstDiffByteDelta;
  if (prefix < bytesPerDim) { // the two split values differ
    firstDiffByteDelta = (splitValue.bytes[address + prefix] & 0xFF) - (lastSplitValues[splitDim * bytesPerDim + prefix] & 0xFF);
    if (negativeDeltas[splitDim]) {
      // inside a left subtree the delta is negative, so flip the sign
      firstDiffByteDelta = -firstDiffByteDelta;
    }
    assert firstDiffByteDelta > 0;
  } else {
    firstDiffByteDelta = 0;
  }
  // pack the prefix, splitDim and delta first diff byte into a single vInt
  // (decoded again in BKDReader.readNodeData()):
  int code = (firstDiffByteDelta * (1 + bytesPerDim) + prefix) * numIndexDims + splitDim;
  writeBuffer.writeVInt(code);

  // write the split value, prefix coded vs. our parent's split value:
  int suffix = bytesPerDim - prefix;
  byte[] savSplitValue = new byte[suffix];
  if (suffix > 1) {
    // store the rest of the split value after the first differing byte
    writeBuffer.writeBytes(splitValue.bytes, address + prefix + 1, suffix - 1);
  }

  byte[] cmp = lastSplitValues.clone(); // defensive copy, no longer the same object
  // save the suffix of lastSplitValues so it can be restored after recursion
  System.arraycopy(lastSplitValues, splitDim * bytesPerDim + prefix, savSplitValue, 0, suffix);
  // copy our split value into lastSplitValues for our children to prefix-code against
  System.arraycopy(splitValue.bytes, address + prefix, lastSplitValues, splitDim * bytesPerDim + prefix, suffix);
  // flush writeBuffer (the code plus the suffix bytes) into blocks as a byte[]
  int numBytes = appendBlock(writeBuffer, blocks);

  // placeholder for left-tree numBytes; we need this so that at search time if we only need to recurse into the right sub-tree we can
  // quickly seek to its starting point
  int idxSav = blocks.size();
  blocks.add(null); // placeholder, replaced below with the left subtree's length
  // inside the left subtree, deltas on splitDim are negative
  boolean savNegativeDelta = negativeDeltas[splitDim];
  negativeDeltas[splitDim] = true;
  int leftNumBytes = recursePackIndex(writeBuffer, leafNodes, leftBlockFP, blocks, lastSplitValues, negativeDeltas, true,
      leavesOffset, numLeftLeafNodes);
  if (numLeftLeafNodes != 1) {
    writeBuffer.writeVInt(leftNumBytes);
  } else { // the leftmost leaf
    assert leftNumBytes == 0 : "leftNumBytes=" + leftNumBytes;
  }
  // size of the encoded leftNumBytes field itself
  int numBytes2 = Math.toIntExact(writeBuffer.getFilePointer());
  byte[] bytes2 = new byte[numBytes2];
  writeBuffer.writeTo(bytes2, 0);
  writeBuffer.reset();
  // replace our placeholder:
  blocks.set(idxSav, bytes2);

  negativeDeltas[splitDim] = false; // right subtree: deltas are positive
  int rightNumBytes = recursePackIndex(writeBuffer, leafNodes, leftBlockFP, blocks, lastSplitValues, negativeDeltas, false,
      rightOffset, numLeaves - numLeftLeafNodes);
  // restore the flag for our caller
  negativeDeltas[splitDim] = savNegativeDelta;
  // restore lastSplitValues to what caller originally passed us:
  System.arraycopy(savSplitValue, 0, lastSplitValues, splitDim * bytesPerDim + prefix, suffix);
  // total bytes used by this subtree: this node + left-length field + left + right
  return numBytes + bytes2.length + leftNumBytes + rightNumBytes;
}
</code></pre></figure>
<p>Nodes are thus written node-first, then children; take node1 as an example:<br><img src="https://kkewwei.github.io/elasticsearch_learning/img/bkd4.png" height="550" width="570"><br>where:<br>deltaFP = leftBlockFP - minBlockFP; minBlockFP is the block file pointer of the parent's leftmost leaf, and leftBlockFP that of this node's leftmost leaf.<br>code = (firstDiffByteDelta * (1+bytesPerDim) + prefix) * numIndexDims + splitDim, where firstDiffByteDelta is the difference, at the first differing byte, between this node's split value and the previous split value on the same dimension. Three quantities are thus packed into a single encoded value; a sketch of the encoding follows.</p>
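<p>Since all three quantities have known bounds (prefix is at most bytesPerDim, splitDim is below numIndexDims), the vInt packs them like digits of a mixed-radix number, and decoding is just division and modulo. A small sketch with my own helper names:</p>
<figure class="highlight plain"><pre><code>
// Mixed-radix packing of (firstDiffByteDelta, prefix, splitDim) into one int,
// matching the formula above; decode peels the "digits" back off.
public class SplitCode {
  static int encode(int firstDiffByteDelta, int prefix, int splitDim,
                    int bytesPerDim, int numIndexDims) {
    return (firstDiffByteDelta * (1 + bytesPerDim) + prefix) * numIndexDims + splitDim;
  }

  static int[] decode(int code, int bytesPerDim, int numIndexDims) {
    int splitDim = code % numIndexDims;
    code /= numIndexDims;
    int prefix = code % (1 + bytesPerDim);
    int firstDiffByteDelta = code / (1 + bytesPerDim);
    return new int[]{firstDiffByteDelta, prefix, splitDim};
  }

  public static void main(String[] args) {
    int bytesPerDim = 8, numIndexDims = 2;
    int code = encode(13, 5, 1, bytesPerDim, numIndexDims);   // (13*9+5)*2+1 = 245
    int[] back = decode(code, bytesPerDim, numIndexDims);
    System.out.printf("delta=%d prefix=%d dim=%d%n", back[0], back[1], back[2]); // 13 5 1
  }
}
</code></pre></figure>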
<p>In the end, the packed BKD tree sits in the blocks list.</p>
<h3 id="存储bkm和bki文件"><a href="#存储bkm和bki文件" class="headerlink" title="存储bkm和bki文件"></a>存储bkm和bki文件</h3><p>在<code>BKDWriter.writeIndex</code>文件中,bki文件存储了bkd树转存后的blocks的二进制数,而bkm文件存储了BKD树的元数据信息:</p>
<figure class="highlight plain"><pre><code>
private void writeIndex(IndexOutput metaOut, IndexOutput indexOut, int countPerLeaf, int numLeaves, byte[] packedIndex, long dataStartFP) throws IOException {
  // metaOut writes the kdm file
  CodecUtil.writeHeader(metaOut, CODEC_NAME, VERSION_CURRENT);
  metaOut.writeVInt(numDataDims);
  metaOut.writeVInt(numIndexDims);
  // number of points per leaf
  metaOut.writeVInt(countPerLeaf);
  metaOut.writeVInt(bytesPerDim);

  // number of leaves plus the per-dimension min/max of the whole tree
  metaOut.writeVInt(numLeaves);
  metaOut.writeBytes(minPackedValue, 0, packedIndexBytesLength);
  metaOut.writeBytes(maxPackedValue, 0, packedIndexBytesLength);

  metaOut.writeVLong(pointCount);
  metaOut.writeVInt(docsSeen.cardinality());
  // size and file position of the packed index so it can be located at read time
  metaOut.writeVInt(packedIndex.length);
  metaOut.writeLong(dataStartFP);
  // If metaOut and indexOut are the same file, we account for the fact that
  // writing a long makes the index start 8 bytes later.
  metaOut.writeLong(indexOut.getFilePointer() + (metaOut == indexOut ? Long.BYTES : 0));
  // indexOut writes the kdi file
  indexOut.writeBytes(packedIndex, 0, packedIndex.length);
}
</code></pre></figure>
<h1 id="总结"><a href="#总结" class="headerlink" title="总结"></a>总结</h1><p>BKD树主要运用在范围多维查找,在空间上,按照完全二叉树结构,将数据分为左右两部分,找到所有point每个维度[min,max]差距最大的维度,在该维度按照照左子树完全小于中间值,右子树完全大于间的值。通过范围查找,能够快速定位出docId。</p>
</div>
<div class="article-info article-info-index">
<div class="article-tag tagcloud">
<i class="icon-price-tags icon"></i>
<ul class="article-tag-list">
<li class="article-tag-list-item">
<a href="javascript:void(0)" class="js-tag article-tag-list-link color3">Lucene、BKW树、Point</a>
</li>
</ul>
</div>
<div class="article-category tagcloud">
<i class="icon-book icon"></i>
<ul class="article-tag-list">
<li class="article-tag-list-item">
<a href="/elasticsearch_learning/categories/Lucene//" class="article-tag-list-link color2">Lucene</a>
</li>
</ul>
</div>
<p class="article-more-link">
<a class="article-more-a" href="/elasticsearch_learning/2020/11/01/Lucene8-6-2底层架构-BKW树构建过程/">展开全文 >></a>
</p>
<div class="clearfix"></div>
</div>
</div>
</article>
<aside class="wrap-side-operation">
<div class="mod-side-operation">
<div class="jump-container" id="js-jump-container" style="display:none;">
<a href="javascript:void(0)" class="mod-side-operation__jump-to-top">
<i class="icon-font icon-back"></i>
</a>
<div id="js-jump-plan-container" class="jump-plan-container" style="top: -11px;">
<i class="icon-font icon-plane jump-plane"></i>
</div>
</div>
</div>
</aside>
<article id="post-ES7-9-1-publish原理详解" class="article article-type-post article-index" itemscope itemprop="blogPost">
<div class="article-inner">
<header class="article-header">
<h1 itemprop="name">
<a class="article-title" href="/elasticsearch_learning/2020/08/04/ES7-9-1-publish原理详解/">ES7.9.1 publish原理详解</a>
</h1>
<a href="/elasticsearch_learning/2020/08/04/ES7-9-1-publish原理详解/" class="archive-article-date">
<time datetime="2020-08-04T09:10:06.000Z" itemprop="datePublished"><i class="icon-calendar icon"></i>2020-08-04</time>
</a>
</header>
<div class="article-entry" itemprop="articleBody">
<p>In an ES cluster, the master maintains updates to the cluster metadata and then distributes them to the data nodes; that distribution step is publish, the subject of this post. We will walk through it using index creation as the running example.</p>
<h1 id="master首先创建新的集群元数据"><a href="#master首先创建新的集群元数据" class="headerlink" title="master首先创建新的集群元数据"></a>master首先创建新的集群元数据</h1><p>当master接收到创建索引的请求后,首先进入如下MetadataCreateIndexService.onlyCreateIndex()函数:</p>
<figure class="highlight plain"><pre><code>
private void onlyCreateIndex(final CreateIndexClusterStateUpdateRequest request,
                             final ActionListener<ClusterStateUpdateResponse> listener) {
  clusterService.submitStateUpdateTask(
      "create-index [" + request.index() + "], cause [" + request.cause() + "]",
      // every create uses its own task instance as the unique batching key
      new AckedClusterStateUpdateTask<ClusterStateUpdateResponse>(Priority.URGENT, request, listener) {
        protected ClusterStateUpdateResponse newResponse(boolean acknowledged) {
          return new ClusterStateUpdateResponse(acknowledged);
        }

        @Override
        public ClusterState execute(ClusterState currentState) throws Exception {
          // builds the new ClusterState; it is not yet the local ClusterState
          return applyCreateIndexRequest(currentState, request, false);
        }
      });
}
</code></pre></figure>
<p>Whenever the master updates cluster state, it goes through submitStateUpdateTask() -> ClusterService.submitStateUpdateTasks() -> MasterService.submitStateUpdateTasks() -> TaskBatcher.submitTasks(); submitTasks() groups the tasks so that compatible ones can be merged and executed together for speed:</p>
<figure class="highlight plain"><pre><code>
public void submitTasks(List<? extends BatchedTask> tasks, @Nullable TimeValue timeout) throws EsRejectedExecutionException {
  final BatchedTask firstTask = tasks.get(0);
  final Map<Object, BatchedTask> tasksIdentity = tasks.stream().collect(Collectors.toMap(
      BatchedTask::getTask,
      Function.identity(),
      (a, b) -> { throw new IllegalStateException("cannot add duplicate task: " + a); },
      IdentityHashMap::new));
  // e.g. several startShard tasks arriving concurrently all synchronize here
  synchronized (tasksPerBatchingKey) {
    LinkedHashSet<BatchedTask> existingTasks = tasksPerBatchingKey.computeIfAbsent(firstTask.batchingKey,
        k -> new LinkedHashSet<>(tasks.size()));
    // merge with the tasks already queued under this batching key
    existingTasks.addAll(tasks);
  }
  if (timeout != null) {
    // runs on the familiar [node][masterService#updateTask][T#1] thread
    threadExecutor.execute(firstTask, timeout, () -> onTimeoutInternal(tasks, timeout));
  } else {
    threadExecutor.execute(firstTask);
  }
}
</code></pre></figure>
<p>As the code shows, tasks are checked for a matching batchingKey and queued together when it matches. Two tasks share a batchingKey only when they were submitted through ClusterService.submitStateUpdateTask with the same ClusterStateTaskExecutor instance; the most common mergeable updates, startShard/failShard, qualify because startShard uses the global singleton ShardStartedClusterStateTaskExecutor as its key. For index creation, by contrast, every request visibly constructs a fresh AckedClusterStateUpdateTask as its batchingKey, so index creations can only be published one at a time. A thread from the pool, the familiar [node][masterService#updateTask][T#1], then builds the new ClusterState. A stripped-down model of the batching follows.</p>
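<p>This toy model (invented types, not the ES classes) makes the key point visible: tasks merge only when they share the same executor object:</p>
<figure class="highlight plain"><pre><code>
import java.util.*;

// Toy model of TaskBatcher: tasks queue under their batching key (the executor
// object), so only tasks submitted with the *same* executor instance merge.
public class Batcher {
  final Map<Object, LinkedHashSet<String>> tasksPerBatchingKey = new HashMap<>();

  void submit(Object batchingKey, String task) {
    synchronized (tasksPerBatchingKey) {
      tasksPerBatchingKey.computeIfAbsent(batchingKey, k -> new LinkedHashSet<>()).add(task);
    }
  }

  public static void main(String[] args) {
    Batcher b = new Batcher();
    Object shardStartedExecutor = new Object();     // one shared instance, like startShard
    b.submit(shardStartedExecutor, "start shard [idx][0]");
    b.submit(shardStartedExecutor, "start shard [idx][1]");  // merges into the same batch
    b.submit(new Object(), "create-index [a]");     // fresh key per create: no merging
    b.submit(new Object(), "create-index [b]");
    System.out.println(b.tasksPerBatchingKey.size() + " batches"); // 3 batches
  }
}
</code></pre></figure>
<p>The batch is then executed on that update thread in MasterService.runTasks():</p>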
<figure class="highlight plain"><pre><code>
private void runTasks(TaskInputs taskInputs) {
  final ClusterState previousClusterState = state();
  final long computationStartTime = threadPool.relativeTimeInMillis();
  // run each task to compute the new ClusterState
  final TaskOutputs taskOutputs = calculateTaskOutputs(taskInputs, previousClusterState);
  taskOutputs.notifyFailedTasks();
  final TimeValue computationTime = getTimeSince(computationStartTime);
  logExecutionTime(computationTime, "compute cluster state update", summary);
  if (taskOutputs.clusterStateUnchanged()) {
    ......
  } else {
    final ClusterState newClusterState = taskOutputs.newClusterState;
    final long publicationStartTime = threadPool.relativeTimeInMillis();
    try {
      ClusterChangedEvent clusterChangedEvent = new ClusterChangedEvent(summary, newClusterState, previousClusterState);
      // this is where the actual broadcast happens
      publish(clusterChangedEvent, taskOutputs, publicationStartTime);
    } catch (Exception e) {
      handleException(summary, publicationStartTime, newClusterState, e);
    }
  }
}
</code></pre></figure>
<p>This function does the following:<br>1. Calls calculateTaskOutputs to produce the new ClusterState; the computation happens in the task defined earlier, whose AckedClusterStateUpdateTask.execute calls applyCreateIndexRequest. That part is not the focus here.<br>2. Calls publish() to broadcast the new state to every node, the master itself included.</p>
<h1 id="master全局广播"><a href="#master全局广播" class="headerlink" title="master全局广播"></a>master全局广播</h1><h2 id="预处理"><a href="#预处理" class="headerlink" title="预处理"></a>预处理</h2><p>进行广播前,master还会做如下预处理:</p>
<figure class="highlight plain"><pre><code>
public void publish(ClusterChangedEvent clusterChangedEvent, ActionListener<Void> publishListener, AckListener ackListener) {
  final PublishRequest publishRequest = coordinationState.get().handleClientValue(clusterState);
  final CoordinatorPublication publication = new CoordinatorPublication(publishRequest, publicationContext,
      new ListenableFuture<>(), ackListener, publishListener);
  currentPublication = Optional.of(publication);
  // all nodes known to the latest cluster state
  final DiscoveryNodes publishNodes = publishRequest.getAcceptedState().nodes();
  // refresh the node list the leader checker validates incoming heartbeats against
  leaderChecker.setCurrentNodes(publishNodes);
  // refresh the heartbeat connections the master keeps to its followers
  followersChecker.setCurrentNodes(publishNodes);
  // refresh the set of nodes whose applied cluster-state version is tracked,
  // so that nodes lagging too long can be detected and removed
  lagDetector.setTrackedNodes(publishNodes);
  // actually start publishing; note that followersChecker.getFaultyNodes() holds nodes
  // whose heartbeats already failed after retries: their PublicationTargetState is
  // marked failed right away
  publication.start(followersChecker.getFaultyNodes());
}
</code></pre></figure>
<p>When the CoordinatorPublication is created, two timers are armed for the whole publication: a cancel timeout (cluster.publish.timeout, default 30s) and an info timeout (cluster.publish.info_timeout, default 10s). The info timeout produces a log line like:</p>
<figure class="highlight plain"><pre><code>
[INFO ] after [10s] publication of cluster state version [407258] is still waiting for {node2}[SENT_APPLY_COMMIT], {node1}[SENT_APPLY_COMMIT]
</code></pre></figure>
<p>The info log lists every node whose commit response the master is still waiting for. When the cancel timeout fires, the whole publication is marked cancelled + isCompleted: nodes that have not yet returned their second-phase response are marked failed outright, and the master goes straight into the publication's wrap-up phase.</p>
<figure class="highlight plain"><pre><code>
public void cancel(String reason) {
  if (isCompleted) {
    return;
  }

  assert cancelled == false;
  cancelled = true;
  // if the commit phase never started (no quorum of master-eligible responses yet),
  // mark every still-active target as failed
  if (applyCommitRequest.isPresent() == false) {
    final Exception e = new ElasticsearchException("publication cancelled before committing: " + reason);
    publicationTargets.stream().filter(PublicationTarget::isActive).forEach(pt -> pt.setFailed(e));
  }
  onPossibleCompletion();
}
</code></pre></figure>
<p>onPossibleCompletion() is covered later. The sketch below models the two timers.</p>
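<p>The two timers can be modeled with a plain scheduler: the info timer only logs the stragglers, while the cancel timer fails whatever is still pending. This is a sketch with invented names and shortened delays, not the actual CoordinatorPublication code:</p>
<figure class="highlight plain"><pre><code>
import java.util.Map;
import java.util.concurrent.*;

// Toy model of the two publication timers: info logs stragglers, cancel fails them.
public class PublicationTimers {
  public static void main(String[] args) throws Exception {
    ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1);
    // node -> has the node's commit response arrived?
    Map<String, Boolean> committed = new ConcurrentHashMap<>(Map.of("node1", false, "node2", false));

    // info timeout (cluster.publish.info_timeout, default 10s; shortened here): log only
    scheduler.schedule(() -> committed.forEach((n, done) -> {
      if (!done) System.out.println("still waiting for " + n + "[SENT_APPLY_COMMIT]");
    }), 100, TimeUnit.MILLISECONDS);

    // cancel timeout (cluster.publish.timeout, default 30s; shortened here): fail stragglers
    scheduler.schedule(() -> {
      committed.forEach((n, done) -> {
        if (!done) System.out.println("marking " + n + " failed, finishing publication");
      });
      scheduler.shutdown();
    }, 300, TimeUnit.MILLISECONDS);

    committed.put("node1", true);   // node1 commits in time; node2 never does
    scheduler.awaitTermination(1, TimeUnit.SECONDS);
  }
}
</code></pre></figure>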
<h2 id="向每个节点广播请求"><a href="#向每个节点广播请求" class="headerlink" title="向每个节点广播请求"></a>向每个节点广播请求</h2><p>开始对每个数据节点发送元数据请求:</p>
<figure class="highlight plain"><pre><code>
void sendPublishRequest() {
  if (isFailed()) {
    return;
  }
  state = PublicationTargetState.SENT_PUBLISH_REQUEST;
  Publication.this.sendPublishRequest(discoveryNode, publishRequest, new PublishResponseHandler());
}
</code></pre></figure>
<p>Before publishing to each target node (the master itself included), the master builds one PublicationTarget per node to track its publish state, i.e. which phase that node's publication has reached:</p>
<figure class="highlight plain"><pre><code>
enum PublicationTargetState {
  NOT_STARTED,          // initial state
  FAILED,               // publish to this node failed, e.g. timeout or node fault
  SENT_PUBLISH_REQUEST, // master has sent the first-phase publish request
  WAITING_FOR_QUORUM,   // the node has responded; master is waiting for a commit quorum
  SENT_APPLY_COMMIT,    // master has sent the second-phase commit request
  APPLIED_COMMIT,       // master has received the node's commit response
}
</code></pre></figure>
<p>After the target is marked SENT_PUBLISH_REQUEST, PublicationContext.sendPublishRequest() ships the ClusterState and registers PublishResponseHandler to process the node's first response. When sending, ES decides whether the target gets the full ClusterState or only a diff; the most common reason for a full send is a node that has just joined the cluster. A simplified model of that choice follows.</p>
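<p>The decision reduces to "does the target already hold the previous state?"; here is a simplified sketch of that choice (invented names, not the actual PublicationContext logic):</p>
<figure class="highlight plain"><pre><code>
import java.util.Set;

// Simplified model of the full-vs-diff decision when shipping cluster state.
public class StatePicker {
  // nodes that acked the previous state can be sent a diff against it
  static byte[] pick(String nodeId, Set<String> nodesOnPreviousState,
                     byte[] serializedFullState, byte[] serializedDiff) {
    if (nodesOnPreviousState.contains(nodeId)) {
      return serializedDiff;       // node can apply the diff on top of what it has
    }
    return serializedFullState;    // e.g. a freshly joined node: needs everything
  }

  public static void main(String[] args) {
    Set<String> known = Set.of("node1", "node2");
    byte[] full = new byte[]{1}, diff = new byte[]{2};
    System.out.println(pick("node1", known, full, diff)[0]); // 2 -> diff
    System.out.println(pick("node9", known, full, diff)[0]); // 1 -> full
  }
}
</code></pre></figure>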
<h1 id="目标节点接收到maser发送的publish请求"><a href="#目标节点接收到maser发送的publish请求" class="headerlink" title="目标节点接收到maser发送的publish请求"></a>目标节点接收到maser发送的publish请求</h1><p>目标节点的PublicationTransportHandler.handleIncomingPublishRequest首先接收到master发送的请求,做了以下三件事情:<br>1.然后第一步就是解析出最新的ClusterState。<br>2.其次进入acceptState()->CoordinationState.handlePublishRequest()构建响应master的response。<br>3.再次调用becomeFollower()变身Follow(非master本身)。<br>我们看下第二步构建Response时做了哪些事情:</p>
<figure class="highlight plain"><pre><code>
public PublishResponse handlePublishRequest(PublishRequest publishRequest) {
  final ClusterState clusterState = publishRequest.getAcceptedState();
  persistedState.setLastAcceptedState(clusterState);
  return new PublishResponse(clusterState.term(), clusterState.version());
}
</code></pre></figure>
<p>Its main job is persisting the received cluster metadata to disk. The persistedState implementation depends on the node's roles:<br>1. On master-eligible nodes, persistedState = LucenePersistedState.<br>2. On data-only nodes, persistedState = AsyncLucenePersistedState.<br>There is also lastSeenClusterState, which exists solely for version checks when the second-phase commit request arrives; it is never used as working metadata.<br>The names already tell the story: master-eligible nodes persist the ClusterState synchronously, so under heavy IO pressure the write can be slow and drag out the whole cluster's publish; data nodes persist asynchronously, avoiding response timeouts caused by blocking on disk. Let's look at how persistence works and what gets written:</p>
<figure class="highlight plain"><pre><code>
void writeIncrementalStateAndCommit(long currentTerm, ClusterState previousClusterState,
                                    ClusterState clusterState) throws IOException {
  try {
    final long startTimeMillis = relativeTimeMillisSupplier.getAsLong();
    // Lucene flush: build the segment
    final WriterStats stats = updateMetadata(previousClusterState.metadata(), clusterState.metadata());
    // Lucene commit: fsync to disk
    commit(currentTerm, clusterState.version());
    final long durationMillis = relativeTimeMillisSupplier.getAsLong() - startTimeMillis;
    final TimeValue finalSlowWriteLoggingThreshold = slowWriteLoggingThresholdSupplier.get();
    // warn if the write was slow
    if (durationMillis >= finalSlowWriteLoggingThreshold.getMillis()) {
      logger.warn("writing cluster state took [{}ms] which is above the warn threshold of [{}]; " +
          "wrote global metadata [{}] and metadata for [{}] indices and skipped [{}] unchanged indices",
          durationMillis, finalSlowWriteLoggingThreshold, stats.globalMetaUpdated, stats.numIndicesUpdated,
          stats.numIndicesUnchanged);
    } else {
      ......
    }
  } finally {
    closeIfAnyIndexWriterHasTragedyOrIsClosed();
  }
}

private WriterStats updateMetadata(Metadata previouslyWrittenMetadata, Metadata metadata) throws IOException {
  // did the global metadata change?
  final boolean updateGlobalMeta = Metadata.isGlobalStateEquals(previouslyWrittenMetadata, metadata) == false;
  // if so, overwrite it entirely
  if (updateGlobalMeta) {
    // rebuild the global metadata document
    try (ReleasableDocument globalMetadataDocument = makeGlobalMetadataDocument(metadata)) {
      // one writer per configured data path; each gets its own copy
      for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
        metadataIndexWriter.updateGlobalMetadata(globalMetadataDocument.getDocument());
      }
    }
  }
  // collect the versions of the previously written index metadata
  final Map<String, Long> indexMetadataVersionByUUID = new HashMap<>(previouslyWrittenMetadata.indices().size());
  for (ObjectCursor<IndexMetadata> cursor : previouslyWrittenMetadata.indices().values()) {
    final IndexMetadata indexMetadata = cursor.value;
    final Long previousValue = indexMetadataVersionByUUID.putIfAbsent(indexMetadata.getIndexUUID(), indexMetadata.getVersion());
  }
  int numIndicesUpdated = 0;
  int numIndicesUnchanged = 0;
  // walk the new IndexMetadata entries
  for (ObjectCursor<IndexMetadata> cursor : metadata.indices().values()) {
    final IndexMetadata indexMetadata = cursor.value;
    final Long previousVersion = indexMetadataVersionByUUID.get(indexMetadata.getIndexUUID());
    // the IndexMetadata is new, or its version changed
    if (previousVersion == null || indexMetadata.getVersion() != previousVersion) {
      logger.trace("updating metadata for [{}], changing version from [{}] to [{}]",
          indexMetadata.getIndex(), previousVersion, indexMetadata.getVersion());
      numIndicesUpdated++;
      try (ReleasableDocument indexMetadataDocument = makeIndexMetadataDocument(indexMetadata)) {
        for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
          metadataIndexWriter.updateIndexMetadataDocument(indexMetadataDocument.getDocument(), indexMetadata.getIndex());
        }
      }
    } else {
      numIndicesUnchanged++;
    }
    indexMetadataVersionByUUID.remove(indexMetadata.getIndexUUID());
  }
  // whatever is left existed before but not anymore: those indices were deleted
  for (String removedIndexUUID : indexMetadataVersionByUUID.keySet()) {
    for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
      metadataIndexWriter.deleteIndexMetadata(removedIndexUUID);
    }
  }
  // Flush, to try and expose a failure (e.g. out of disk space) before committing, because we can handle a failure here more
  // gracefully than one that occurs during the commit process.
  for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
    metadataIndexWriter.flush();
  }
  return new WriterStats(updateGlobalMeta, numIndicesUpdated, numIndicesUnchanged);
}
</code></pre></figure>
<p>Two things are worth noting:<br>1. The global metadata does not embed every IndexMetadata; each index's metadata is stored as a separate entry.<br>2. When persisting, the node iterates over every configured data.path and writes a full copy to each. In production we should therefore strictly separate data and master roles; otherwise the data role can saturate disk IO and slow down the persistence of metadata.<br>Note: at this point the data node has only persisted and updated the cluster metadata locally; it has not yet merged the new ClusterState into the one it is actively using. The new ClusterState only becomes the node's local metadata after the node receives the commit request from the master.</p>
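<p>To make the "one copy per data path" behaviour concrete, here is a minimal sketch. MetadataStore and MetadataWriter are hypothetical stand-ins for illustration, not the actual PersistedClusterStateService classes:</p>
<figure class="highlight plain"><pre><code>import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

class MetadataWriter {
    private final Path dir;
    MetadataWriter(Path dir) { this.dir = dir; }
    void write(byte[] document) throws IOException {
        // stand-in for the Lucene index writes ES performs under each path's _state directory
        Files.createDirectories(dir);
        Files.write(dir.resolve("global.st"), document);
    }
}

class MetadataStore {
    private final List<MetadataWriter> writers = new ArrayList<>();

    MetadataStore(List<Path> dataPaths) throws IOException {
        // one independent writer per configured data path
        for (Path dataPath : dataPaths) {
            writers.add(new MetadataWriter(dataPath.resolve("_state")));
        }
    }

    void writeGlobalMetadata(byte[] document) throws IOException {
        // every data path receives its own full copy
        for (MetadataWriter writer : writers) {
            writer.write(document);
        }
    }
}</code></pre></figure>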
<h1 id="master接收到目标节点发送的publish响应"><a href="#master接收到目标节点发送的publish响应" class="headerlink" title="master接收到目标节点发送的publish响应"></a>master接收到目标节点发送的publish响应</h1><p>master接收到data响应的响应是在Publication$PublicationTarget$PublishResponseHandler.onResponse(),首先将对该节点publish请求状态置为WAITING_FOR_QUORUM,然后进入PublicationTarget.handlePublishResponse()</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br></pre></td><td class="code"><pre><span class="line">void handlePublishResponse(PublishResponse publishResponse) {</span><br><span class="line"> // master属性节点响应过半后,才会赋值。接着直接第二次commit</span><br><span class="line"> if (applyCommitRequest.isPresent()) { </span><br><span class="line"> sendApplyCommit();</span><br><span class="line"> } else {</span><br><span class="line"> // master检查是否有资格发送commit请求</span><br><span class="line"> try {</span><br><span class="line"> // 响应节点过半的,继续执行</span><br><span class="line"> Publication.this.handlePublishResponse(discoveryNode, publishResponse).ifPresent(applyCommit -> { </span><br><span class="line"> assert applyCommitRequest.isPresent() == false;</span><br><span class="line"> applyCommitRequest = Optional.of(applyCommit);</span><br><span class="line"> ackListener.onCommit(TimeValue.timeValueMillis(currentTimeSupplier.getAsLong() - startTime)); </span><br><span class="line"> // master对第一次响应的节点(状态为WAITING_FOR_QUORUM)开始进行第二次commit</span><br><span class="line"> publicationTargets.stream().filter(PublicationTarget::isWaitingForQuorum)</span><br><span class="line"> .forEach(PublicationTarget::sendApplyCommit);</span><br><span class="line"> });</span><br><span class="line"> } catch (Exception e) {</span><br><span class="line"> setFailed(e);</span><br><span class="line"> onPossibleCommitFailure();</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>This function does the following:<br>1. It first checks whether an applyCommitRequest has already been sent to some nodes. The master may only send applyCommitRequest once a majority of the master-eligible nodes have responded (a raft-style property).<br>2. If the master has not yet sent any applyCommitRequest, it checks whether it is now entitled to send the second-phase commit. If so, it sends a commit request to every node whose state is WAITING_FOR_QUORUM.</p>
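<p>The majority condition can be illustrated with a small sketch. This VoteCollection class is a simplified, hypothetical model of the check; the real logic lives in ES's CoordinationState:</p>
<figure class="highlight plain"><pre><code>import java.util.HashSet;
import java.util.Set;

class VoteCollection {
    private final Set<String> masterEligibleNodes;
    private final Set<String> votesReceived = new HashSet<>();

    VoteCollection(Set<String> masterEligibleNodes) {
        this.masterEligibleNodes = masterEligibleNodes;
    }

    void addVote(String nodeId) {
        // only master-eligible nodes count towards the quorum
        if (masterEligibleNodes.contains(nodeId)) {
            votesReceived.add(nodeId);
        }
    }

    boolean isQuorum() {
        // strictly more than half of the master-eligible nodes must have responded
        return votesReceived.size() * 2 > masterEligibleNodes.size();
    }
}</code></pre></figure>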
<h1 id="master向目标节点发送二次commit请求"><a href="#master向目标节点发送二次commit请求" class="headerlink" title="master向目标节点发送二次commit请求"></a>master向目标节点发送二次commit请求</h1><p>master收到过半master属性的第一次response请求后,开始对WAITING_FOR_QUORUM状态的节点发送commit请求:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">void sendApplyCommit() {</span><br><span class="line"> //对目标节点发送状态置为SENT_APPLY_COMMIT</span><br><span class="line"> state = PublicationTargetState.SENT_APPLY_COMMIT; </span><br><span class="line"> Publication.this.sendApplyCommit(discoveryNode, applyCommitRequest.get(), new ApplyCommitResponseHandler());</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<h1 id="目标节点接收到master发送的二次commit请求"><a href="#目标节点接收到master发送的二次commit请求" class="headerlink" title="目标节点接收到master发送的二次commit请求"></a>目标节点接收到master发送的二次commit请求</h1><p>目前目标节点收到master发送的commit请求后,首先进入了Coordinator.handleApplyCommit()</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br></pre></td><td class="code"><pre><span class="line">private void handleApplyCommit(ApplyCommitRequest applyCommitRequest, ActionListener<Void> applyListener) {</span><br><span class="line"> synchronized (mutex) {</span><br><span class="line"> // master节点收到本节点的commit响应</span><br><span class="line"> if (applyCommitRequest.getSourceNode().equals(getLocalNode())) { </span><br><span class="line"> // master合并元数据到全局将在收到所有数据节commit响应后(具体见CoordinatorPublication.onCompletion()),将跑到transportCommitCallback</span><br><span class="line"> applyListener.onResponse(null); </span><br><span class="line"> } else { // 数据节点收到master发送的commit请求</span><br><span class="line"> clusterApplier.onNewClusterState(applyCommitRequest.toString(), () -> applierState,</span><br><span class="line"> new ClusterApplyListener() {</span><br><span class="line"></span><br><span class="line"> @Override</span><br><span class="line"> public void onFailure(String source, Exception e) {</span><br><span class="line"> applyListener.onFailure(e); // 将跑到PublicationTransportHandler.transportCommitCallback</span><br><span class="line"> }</span><br><span class="line"></span><br><span class="line"> @Override</span><br><span class="line"> public void onSuccess(String source) {</span><br><span class="line"> applyListener.onResponse(null);// 将跑到PublicationTransportHandler.transportCommitCallback</span><br><span class="line"> }</span><br><span class="line"> });</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>The target node reacts differently depending on its role:<br>1. If the node is the active master itself, it merely responds via PublicationTransportHandler.transportCommitCallback(); the master only adopts the new metadata as its local global metadata after the whole publish() completes (covered later).<br>2. If the node is not the active master, it calls ClusterApplierService.onNewClusterState() to adopt the new ClusterState as the latest global ClusterState it maintains.<br>Let's look at what the target node does while replacing its metadata; the real work happens in ClusterApplierService.runTask():</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br></pre></td><td class="code"><pre><span class="line">private void runTask(UpdateTask task) {</span><br><span class="line"> final ClusterState previousClusterState = state.get();</span><br><span class="line"> long startTimeMS = currentTimeInMillis();</span><br><span class="line"> final StopWatch stopWatch = new StopWatch();</span><br><span class="line"> final ClusterState newClusterState;</span><br><span class="line"> try {</span><br><span class="line"> try (Releasable ignored = stopWatch.timing("running task [" + task.source + ']')) {</span><br><span class="line"> // 直接获取的是最新ClusterState</span><br><span class="line"> newClusterState = task.apply(previousClusterState); </span><br><span class="line"> }</span><br><span class="line"> } catch (Exception e) {</span><br><span class="line"> ......</span><br><span class="line"> return;</span><br><span class="line"> }</span><br><span class="line"></span><br><span class="line"> if (previousClusterState == newClusterState) {</span><br><span class="line"> TimeValue executionTime = TimeValue.timeValueMillis(Math.max(0, currentTimeInMillis() - startTimeMS));</span><br><span class="line"> warnAboutSlowTaskIfNeeded(executionTime, task.source, stopWatch);</span><br><span class="line"> task.listener.onSuccess(task.source);</span><br><span class="line"> } else {</span><br><span class="line"> try {// 超级重要,当集群元数据修改后,会去做一系列检查,比如创建索引等,将分配给本节点的分片状态置位started等</span><br><span class="line"> applyChanges(task, previousClusterState, newClusterState, stopWatch);</span><br><span class="line"> // 会去调用PublicationTransportHandler.transportCommitCallback(),直接响应主master</span><br><span class="line"> task.listener.onSuccess(task.source); </span><br><span class="line"> } catch (Exception e) {</span><br><span class="line"> ......</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>A non-master node does the following:<br>1. Fetches the latest ClusterState.<br>2. Checks whether the old and new ClusterState are identical; if so, nothing is done.<br>3. If the ClusterState has changed, calls applyChanges() to adapt the local node to the latest ClusterState.</p>
<p>Let's see how applyChanges() adapts the local node:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br></pre></td><td class="code"><pre><span class="line">private void applyChanges(UpdateTask task, ClusterState previousClusterState, ClusterState newClusterState, StopWatch stopWatch) {</span><br><span class="line"> ClusterChangedEvent clusterChangedEvent = new ClusterChangedEvent(task.source, newClusterState, previousClusterState);</span><br><span class="line"> final DiscoveryNodes.Delta nodesDelta = clusterChangedEvent.nodesDelta();</span><br><span class="line"> // 比如节点个数发生了变化,那么就跑到这里</span><br><span class="line"> if (nodesDelta.hasChanges() && logger.isInfoEnabled()) {</span><br><span class="line"> String summary = nodesDelta.shortSummary();</span><br><span class="line"> if (summary.length() > 0) {</span><br><span class="line"> logger.info("{}, term: {}, version: {}, reason: {}",</span><br><span class="line"> summary, newClusterState.term(), newClusterState.version(), task.source);</span><br><span class="line"> } // 会打印 removed {{、added {{日志</span><br><span class="line"> } // 若added,那么已经认同加入集群了</span><br><span class="line"></span><br><span class="line"> try (Releasable ignored = stopWatch.timing("connecting to new nodes")) {</span><br><span class="line"> connectToNodesAndWait(newClusterState);</span><br><span class="line"> }</span><br><span class="line"></span><br><span class="line"> callClusterStateAppliers(clusterChangedEvent, stopWatch);</span><br><span class="line"> </span><br><span class="line"> nodeConnectionsService.disconnectFromNodesExcept(newClusterState.nodes());</span><br><span class="line"></span><br><span class="line"> state.set(newClusterState);</span><br><span class="line"> // 这里也比较重要,会去等待新的集群状态,然后触发某些操作(比如请求集群状态,但是此时没有maser,可见TransportMasterNodeAction$AsyncSingleAction.retry())</span><br><span class="line"> callClusterStateListeners(clusterChangedEvent, stopWatch);</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>A non-master node does the following:<br>1. Checks whether nodes joined or left; it actively connects to newly added nodes and prints a log line such as:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">[2020-08-10T12:12:22,781][INFO ][o.e.c.s.ClusterApplierService] [node1] added {{node2}}, term: 28, version: 483578, reason: ApplyCommitRequest{term=28, version=483578, sourceNode={master}}</span><br></pre></td></tr></table></figure>
<p>2. Calls callClusterStateAppliers() locally to act on the latest ClusterState, e.g. creating IndexServices, deleting index data, allocating shards, and so on. The essential implementation:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre></td><td class="code"><pre><span class="line">private void callClusterStateAppliers(ClusterChangedEvent clusterChangedEvent, StopWatch stopWatch) {</span><br><span class="line"> clusterStateAppliers.forEach(applier -> {</span><br><span class="line"> try (Releasable ignored = stopWatch.timing("running applier [" + applier + "]")) {</span><br><span class="line"> applier.applyClusterState(clusterChangedEvent);</span><br><span class="line"> }</span><br><span class="line"> });</span><br><span class="line"> }</span><br></pre></td></tr></table></figure>
<p>clusterStateAppliers = {highPriorityStateAppliers, normalPriorityStateAppliers, lowPriorityStateAppliers}. Among the highPriorityStateAppliers, IndicesClusterStateService.applyClusterState() deserves special attention:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br></pre></td><td class="code"><pre><span class="line">public synchronized void applyClusterState(final ClusterChangedEvent event) {</span><br><span class="line"> final ClusterState state = event.state();</span><br><span class="line"> updateFailedShardsCache(state);</span><br><span class="line"> deleteIndices(event); // also deletes shards of deleted indices</span><br><span class="line"> removeIndices(event); // also removes shards of removed indices</span><br><span class="line"> failMissingShards(state);</span><br><span class="line"> // 删除被删掉的索引的shard</span><br><span class="line"> removeShards(state); // removes any local shards that doesn't match what the master expects</span><br><span class="line"> // 本地更新本地索引元数据</span><br><span class="line"> updateIndices(event); // can also fail shards, but these are then guaranteed to be in failedShardsCache</span><br><span class="line"> // 在本地创建索引元数据</span><br><span class="line"> createIndices(state);</span><br><span class="line"> // 恢复或者创建分片</span><br><span class="line"> createOrUpdateShards(state);</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>This is how a non-master node promptly adjusts its locally maintained metadata to the global ClusterState.<br>3. Closes connections to nodes that have left.<br>4. Stores the latest global ClusterState it maintains in the ClusterApplierService.state object.<br>5. Calls callClusterStateListeners() to fire the callbacks. For example, a data node that needs to reach the master but cannot find one locally registers a listener and retries once the locally maintained ClusterState changes; likewise, a node that was master but is no longer one after the update performs its cleanup here.</p>
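<p>As a concrete illustration of point 5, here is a minimal sketch of a listener that retries once a master shows up in the applied state. It is a simplified stand-in for the retry in TransportMasterNodeAction$AsyncSingleAction, not the actual code:</p>
<figure class="highlight plain"><pre><code>import org.elasticsearch.cluster.ClusterChangedEvent;
import org.elasticsearch.cluster.ClusterStateListener;

class RetryOnMasterListener implements ClusterStateListener {
    private final Runnable retry; // the action to re-dispatch once a master is known

    RetryOnMasterListener(Runnable retry) {
        this.retry = retry;
    }

    @Override
    public void clusterChanged(ClusterChangedEvent event) {
        // fired from callClusterStateListeners() after state.set(newClusterState)
        if (event.state().nodes().getMasterNode() != null) {
            retry.run();
        }
    }
}</code></pre></figure>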
<h1 id="master接收到目标节点发送二次commit响应"><a href="#master接收到目标节点发送二次commit响应" class="headerlink" title="master接收到目标节点发送二次commit响应"></a>master接收到目标节点发送二次commit响应</h1><p>master收到目标节点的二次响应后,最先进入ApplyCommitResponseHandler.onResponse()函数:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br></pre></td><td class="code"><pre><span class="line">public void onResponse(TransportResponse.Empty ignored) {</span><br><span class="line"> if (isFailed()) {</span><br><span class="line"> return;</span><br><span class="line"> }</span><br><span class="line"> // 修改这个确定的二次确认为已完成,</span><br><span class="line"> setAppliedCommit(); </span><br><span class="line"> // 同时检查是不是所有节点都二次响应</span><br><span class="line"> onPossibleCompletion(); </span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>The master does the following:<br>1. Confirms the second-phase response for the target node: it sets the node's publish state to APPLIED_COMMIT, then enters the listener created in the CoordinatorPublication constructor to update the latest ClusterState version the master tracks for each data node.</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br></pre></td><td class="code"><pre><span class="line">public void onNodeAck(DiscoveryNode node, Exception e) {</span><br><span class="line"> // acking and cluster state application for local node is handled specially</span><br><span class="line"> //本节点是master即为响应节点</span><br><span class="line"> if (node.equals(getLocalNode())) { </span><br><span class="line"> synchronized (mutex) {</span><br><span class="line"> if (e == null) {</span><br><span class="line"> // master本身第二次确认完成, 仅仅设置localNodeAckEvent为done</span><br><span class="line"> localNodeAckEvent.onResponse(null);</span><br><span class="line"> } else {</span><br><span class="line"> localNodeAckEvent.onFailure(e);</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> } else {// 响应节点为非master节点 </span><br><span class="line"> // 会跑到 AckCountDownListener.onNodeAck()里面检查是否全部全部节点ack。数据节点不会finish</span><br><span class="line"> ackListener.onNodeAck(node, e);</span><br><span class="line"> // 这里比较重要,会去更新本节点维护的数据节点的version,若version落后超时,会有惩罚机制</span><br><span class="line"> if (e == null) { </span><br><span class="line"> lagDetector.setAppliedVersion(node, publishRequest.getAcceptedState().version());</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>For each target node's second-phase commit response:<br>1.1 If the target node is the master itself, it merely marks localNodeAckEvent as done (used later).<br>1.2 If the target node is not the master, the master updates the ClusterState version it tracks for that node (a node that lags too far behind is actively removed from the cluster, as described later).</p>
<p>2. Calls onPossibleCompletion() to check whether the whole publication has finished.</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br></pre></td><td class="code"><pre><span class="line">private void onPossibleCompletion() {</span><br><span class="line"> // 若超时30s(cluster.publish.timeout),就cancelled=true,置为失败</span><br><span class="line"> if (cancelled == false) { </span><br><span class="line"> for (final PublicationTarget target : publicationTargets) { // 遍历每一个target</span><br><span class="line"> // 只要还有一个没有第二次确认完成,就退出</span><br><span class="line"> if (target.isActive()) { </span><br><span class="line"> return;</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> } </span><br><span class="line"> // 此时1.要是cancelled=true;2.要么cancelled=false, 但是所有目标节点publish状态已经done->applyCommitRequest已经发送请求。</span><br><span class="line"> if (applyCommitRequest.isPresent() == false) {</span><br><span class="line"> //还没有任何节点进行第二次commit:超时导致的失败</span><br><span class="line"> logger.debug("onPossibleCompletion: [{}] commit failed", this);</span><br><span class="line"> assert isCompleted == false;</span><br><span class="line"> isCompleted = true;</span><br><span class="line"> onCompletion(false)</span><br><span class="line"> return;</span><br><span class="line"> } </span><br><span class="line"> isCompleted = true;</span><br><span class="line"> //全部完成了才会去调用 这里还有大作用,会去调用CoordinatorPublication.applyClusterState()</span><br><span class="line"> onCompletion(true); </span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>The checks performed:<br>2.1 If cancelled has not been set and at least one target node has not finished the second-phase commit, bail out and keep waiting.<br>2.2 If applyCommitRequest is empty, the failure was caused by the timeout: the whole publication has completed as a failure, and onCompletion(false) is entered.<br>2.3 Otherwise every node has finished its second-phase commit response, and onCompletion(true) is entered.</p>
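<p>A minimal sketch of how such a publication timeout can be wired up, assuming a plain ScheduledExecutorService instead of ES's ThreadPool:</p>
<figure class="highlight plain"><pre><code>import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

class PublicationTimeout {
    private volatile boolean cancelled = false;

    void start(ScheduledExecutorService scheduler, Runnable onPossibleCompletion) {
        scheduler.schedule(() -> {
            // after the timeout, mark the publication cancelled and re-run the
            // completion check, which now treats unfinished targets as failed
            cancelled = true;
            onPossibleCompletion.run();
        }, 30, TimeUnit.SECONDS); // 30s mirrors the cluster.publish.timeout default
    }
}</code></pre></figure>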
<p>Let's keep going and see what the master does in onCompletion():</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br></pre></td><td class="code"><pre><span class="line">//master节点上,所有任务已经完成(isCompleted=true),可能任务全部失败了(超时30会设置),也可能任务全部成功了</span><br><span class="line">protected void onCompletion(boolean committed) { </span><br><span class="line"> // master本身完成二次确认</span><br><span class="line"> localNodeAckEvent.addListener(new ActionListener<Void>() { </span><br><span class="line"> @Override</span><br><span class="line"> public void onResponse(Void ignore) {</span><br><span class="line"> receivedJoinsProcessed = true;</span><br><span class="line"> // 也是比较重要的, master合并元数据进本身的ClusterState。数据节点合并是在收到commit请求后就合并(详见Coordinator.handleApplyCommit()函数)</span><br><span class="line"> clusterApplier.onNewClusterState(Coordinator.CoordinatorPublication.this.toString(), () -> applierState, // 进去会去调用</span><br><span class="line"> new ClusterApplier.ClusterApplyListener() {</span><br><span class="line"> @Override</span><br><span class="line"> public void onSuccess(String source) { // 本地master更新后</span><br><span class="line"> synchronized (mutex) {</span><br><span class="line"> currentPublication = Optional.empty();</span><br><span class="line"> // trigger term bump if new term was found during publication</span><br><span class="line"> updateMaxTermSeen(getCurrentTerm());</span><br><span class="line"></span><br><span class="line"> if (mode == Coordinator.Mode.LEADER) {</span><br><span class="line"> .......</span><br><span class="line"> }</span><br><span class="line"> // 开始对滞后的节点进行处理</span><br><span class="line"> lagDetector.startLagDetector(publishRequest.getAcceptedState().version()); </span><br><span class="line"> logIncompleteNodes(Level.WARN); // 超时30s的节点报警</span><br><span class="line"> }</span><br><span class="line"> cancelTimeoutHandlers(); // 取消超时</span><br><span class="line"> ackListener.onNodeAck(getLocalNode(), null); // 本节点也完成了</span><br><span class="line"> publishListener.onResponse(null);</span><br><span class="line"> }</span><br><span class="line"> });</span><br><span class="line"> }</span><br><span class="line"> }, EsExecutors.newDirectExecutorService(), transportService.getThreadPool().getThreadContext());</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>In short: once the master has received its own second-phase commit response (localNodeAckEvent is done), it<br>1. Calls ClusterApplierService.onNewClusterState() to merge the new ClusterState into the local node (see <a href="https://kkewwei.github.io/elasticsearch_learning/2020/08/04/ES7-9-1-publish%E5%8E%9F%E7%90%86%E8%AF%A6%E8%A7%A3/#%E7%9B%AE%E6%A0%87%E8%8A%82%E7%82%B9%E6%8E%A5%E6%94%B6%E5%88%B0master%E5%8F%91%E9%80%81%E7%9A%84%E4%BA%8C%E6%AC%A1commit%E8%AF%B7%E6%B1%82">how a data node merges the new global metadata</a>).<br>2. Starts checking the ClusterState versions it tracks for the data nodes: any node whose applied version is still behind the published version after the timeout is removed from the cluster. The timeout is 90s, governed by the cluster.follower_lag.timeout setting.</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br></pre></td><td class="code"><pre><span class="line">void checkForLag(final long version) {</span><br><span class="line"> if (appliedStateTrackersByNode.get(discoveryNode) != this) {</span><br><span class="line"> logger.trace("{} no longer active when checking version {}", this, version);</span><br><span class="line"> return;</span><br><span class="line"> }</span><br><span class="line"></span><br><span class="line"> long appliedVersion = this.appliedVersion.get();</span><br><span class="line"> // 落后</span><br><span class="line"> logger.warn(</span><br><span class="line"> "node [{}] is lagging at cluster state version [{}], although publication of cluster state version [{}] completed [{}] ago",</span><br><span class="line"> discoveryNode, appliedVersion, version, clusterStateApplicationTimeout);</span><br><span class="line"> onLagDetected.accept(discoveryNode); // 在 Coordinator 构造函数中。惩罚将节点脱离集群,实际调用removeNode()函数</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>It prints a log line like:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">[2020-08-10T14:12:24,781][WARN ][o.e.c.c.LagDetector] [master] node [node1] is lagging at cluster state version [483037], although publication of cluster state version [483038] completed [1.5m] ago</span><br></pre></td></tr></table></figure>
<p>Then it calls Coordinator.removeNode(), which broadcasts the global metadata once again.<br>3. Logs the nodes for which the publication did not complete within the timeout:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">[2020-08-10T12:12:24,781][WARN ][o.e.c.c.C.CoordinatorPublication] [master1] after [30.1s] publication of cluster state version [483038] is still waiting for {node1}[SENT_APPLY_COMMIT], {node2} [SENT_APPLY_COMMIT]</span><br></pre></td></tr></table></figure>
<h1 id="总结"><a href="#总结" class="headerlink" title="总结"></a>总结</h1><p>master广播全过程分为第一次广播+第二次commit请求,只有过半master节点响应才能继续第二次广播。在30s超时时间后,主动设置publish状态为true, 在规定时间内元数据更新较慢的节点,master会主动将其剔除集群。</p>
</div>
<div class="article-info article-info-index">
<div class="article-category tagcloud">
<i class="icon-book icon"></i>
<ul class="article-tag-list">
<li class="article-tag-list-item">
<a href="/elasticsearch_learning/categories/Elasticsearch//" class="article-tag-list-link color4">Elasticsearch</a>
</li>
</ul>
</div>
<p class="article-more-link">
<a class="article-more-a" href="/elasticsearch_learning/2020/08/04/ES7-9-1-publish原理详解/">展开全文 >></a>
</p>
<div class="clearfix"></div>
</div>
</div>
</article>
<aside class="wrap-side-operation">
<div class="mod-side-operation">
<div class="jump-container" id="js-jump-container" style="display:none;">
<a href="javascript:void(0)" class="mod-side-operation__jump-to-top">
<i class="icon-font icon-back"></i>
</a>
<div id="js-jump-plan-container" class="jump-plan-container" style="top: -11px;">
<i class="icon-font icon-plane jump-plane"></i>
</div>
</div>
</div>
</aside>
<article id="post-Lucene8-2-0底层架构-tim-tip词典结构原理研究" class="article article-type-post article-index" itemscope itemprop="blogPost">
<div class="article-inner">
<header class="article-header">
<h1 itemprop="name">
<a class="article-title" href="/elasticsearch_learning/2020/02/28/Lucene8-2-0底层架构-tim-tip词典结构原理研究/">Lucene8.2.0底层架构-tim/tip词典结构原理研究</a>
</h1>
<a href="/elasticsearch_learning/2020/02/28/Lucene8-2-0底层架构-tim-tip词典结构原理研究/" class="archive-article-date">
<time datetime="2020-02-28T05:30:35.000Z" itemprop="datePublished"><i class="icon-calendar icon"></i>2020-02-28</time>
</a>
</header>
<div class="article-entry" itemprop="articleBody">
<p>Lucene has two main kinds of inverted index structures. One is the term dictionary, which involves the tim, tip, doc and pos files. The other is the term vector, which records per-term statistics within a single document and involves the tvd and tvm files (see <a href="https://kkewwei.github.io/elasticsearch_learning/2020/03/02/Lucene8-2-0%E5%BA%95%E5%B1%82%E6%9E%B6%E6%9E%84-tvd-tvm%E8%AF%8D%E5%85%B8%E5%90%91%E9%87%8F%E7%BB%93%E6%9E%84%E7%A0%94%E7%A9%B6/">Lucene8.2.0底层架构-tvd/tvm词典向量结构研究</a>). The two record broadly similar information, and both builders inherit from <code>TermsHashPerField</code>; within a segment, all documents of the same field share these two objects.<br>The former is built by <code>FreqProxTermsWriterPerField</code>, the latter by <code>TermVectorsConsumerPerField</code>. Their structures are similar: the former records, for each term, frequencies and the like across all documents of the segment for the current field, while the latter records them within the current document only. The following figure gives a rough picture of the two objects:<br><img src="https://kkewwei.github.io/elasticsearch_learning/img/lucene_tim1.png" height="500" width="400"><br>What connects the two is the bytePool (for its structure see <a href="https://kkewwei.github.io/elasticsearch_learning/2019/10/06/Lucene8-2-0%E5%BA%95%E5%B1%82%E6%9E%B6%E6%9E%84-ByteBlockPool%E7%BB%93%E6%9E%84%E5%88%86%E6%9E%90/">Lucene8.2.0底层架构-ByteBlockPool结构分析</a>), which stores the term contents and is shared between dictionary and TermVector construction to save memory. Note that the termId produced by the former is unique per field within the segment, while the latter's is unique only within the field of one document.<br>The structure Lucene queries most is the term dictionary: given a term, find which documents contain it. This is the inverted index, structured as follows:<br><img src="https://kkewwei.github.io/elasticsearch_learning/img/lucene_tim2.png" height="350" width="400"><br>As the figure shows, once we know the termId we can readily retrieve the term's frequency, positions, offsets and so on in every field of every document. This article digs into the dictionary-building process.</p>
<h1 id="词典在内存中构建"><a href="#词典在内存中构建" class="headerlink" title="词典在内存中构建"></a>词典在内存中构建</h1><p>在对字典字段设置时, 可以进行如下设置:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br></pre></td><td class="code"><pre><span class="line">FieldType fieldType = new FieldType();</span><br><span class="line">// 对term建立倒排索引存储的数据</span><br><span class="line">fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);</span><br><span class="line">fieldType.setTokenized(true);//分词</span><br><span class="line">fieldType.setStoreTermVectors(true);//分词</span><br><span class="line">fieldType.setOmitNorms(true);//分词</span><br><span class="line">fieldType.setStoreTermVectorOffsets(true);//分词</span><br><span class="line">fieldType.setStoreTermVectorPayloads(true);//分词</span><br><span class="line">fieldType.setStoreTermVectorPositions(true);//分词</span><br></pre></td></tr></table></figure>
<p>1. The meaning of the setIndexOptions() values:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br></pre></td><td class="code"><pre><span class="line">// 不建立词的倒排索引</span><br><span class="line">NONE,</span><br><span class="line">// 仅仅对词的建立索引结构</span><br><span class="line">DOCS,</span><br><span class="line">// 在termVector中仅存储词频</span><br><span class="line">DOCS_AND_FREQS,</span><br><span class="line">// 在termVector中存储词频和词position</span><br><span class="line">DOCS_AND_FREQS_AND_POSITIONS,</span><br><span class="line">// 在termVector中存储词频和词position和offset</span><br><span class="line">DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS,</span><br></pre></td></tr></table></figure>
<p>These options apply to the dictionary: DOCS only builds the termId->doc mapping; FREQS additionally counts each term's frequency per field; POSITIONS records each term's positions within each field; and OFFSETS records each term's offsets within each field.<br>2. The setStoreTermVector…() options act on the TermVector and only take effect when setIndexOptions() is not NONE.<br>The two groups of options look alike, but the first group drives the statistics in the dictionary, which is segment-global, while the second group drives the TermVector, whose statistics cover a single field of a single document.</p>
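<p>A short, self-contained usage sketch of indexing one document with such a FieldType (standard Lucene 8.x API; the directory path is an arbitrary example):</p>
<figure class="highlight plain"><pre><code>import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import java.nio.file.Paths;

public class IndexOneDoc {
    public static void main(String[] args) throws Exception {
        FieldType fieldType = new FieldType();
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        fieldType.setTokenized(true);
        fieldType.setStoreTermVectors(true);
        try (FSDirectory dir = FSDirectory.open(Paths.get("/tmp/lucene-demo"));
             IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            // each token of the value flows through DefaultIndexingChain.processField() -> invert()
            doc.add(new Field("body", "lucene builds the term dictionary", fieldType));
            writer.addDocument(doc);
            writer.commit();
        }
    }
}</code></pre></figure>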
<p>We start from <code>DefaultIndexingChain.processField()</code>, which first checks the field's settings:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br></pre></td><td class="code"><pre><span class="line">if (fieldType.indexOptions() != IndexOptions.NONE) {</span><br><span class="line"> // 每个字段只会保存一个 PerField 对象。</span><br><span class="line"> fp = getOrAddField(fieldName, fieldType, true);</span><br><span class="line"> // 这个文档中这个域不是重复写入</span><br><span class="line"> boolean first = fp.fieldGen != fieldGen;</span><br><span class="line"> // 创建倒排索引</span><br><span class="line"> fp.invert(field, first);</span><br><span class="line"> // 域是第一次写入</span><br><span class="line"> if (first) {</span><br><span class="line"> // 这里才是真正存放,和fieldHash存放的是一个对象。真正统计的是当前文档所有的域。</span><br><span class="line"> fields[fieldCount++] = fp;</span><br><span class="line"> fp.fieldGen = fieldGen;</span><br><span class="line"> }</span><br><span class="line">} else {</span><br><span class="line"> // 若我们fieldType=NONE, 那么关于存储termVector设置都是不合理的。</span><br><span class="line"> verifyUnIndexedFieldType(fieldName, fieldType);</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>This function does two things:<br>1. Checks whether the field has already been written.<br>2. Calls <code>fp.invert()</code> to tokenize the field.</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br></pre></td><td class="code"><pre><span class="line"> public void invert(IndexableField field, boolean first) throws IOException { // PerFieldl里面开始</span><br><span class="line"> if (first) { // 第一次该字段被写入</span><br><span class="line"> invertState.reset(); // 每次写入一个新的文档,这里都会被清空</span><br><span class="line"> }</span><br><span class="line"> IndexableFieldType fieldType = field.fieldType();</span><br><span class="line"></span><br><span class="line"> final boolean analyzed = fieldType.tokenized() && docState.analyzer != null; // 是否分词</span><br><span class="line"> // 对域的值进行了分词</span><br><span class="line"> try (TokenStream stream = tokenStream = field.tokenStream(docState.analyzer, tokenStream)) {</span><br><span class="line"> // 针对TermVectorsConsumerPerField里面termvector参数进行设置</span><br><span class="line"> termsHashPerField.start(field, first);</span><br><span class="line"> while (stream.incrementToken()) {</span><br><span class="line"> // 每个词增量为1</span><br><span class="line"> // 对position和offset进行统计。</span><br><span class="line"> invertState.position += posIncr; // 每个词新增1</span><br><span class="line"> invertState.lastPosition = invertState.position;</span><br><span class="line"> // 词的起始位置</span><br><span class="line"> int startOffset = invertState.offset + invertState.offsetAttribute.startOffset();</span><br><span class="line"> // 这个词的末尾</span><br><span class="line"> int endOffset = invertState.offset + invertState.offsetAttribute.endOffset();</span><br><span class="line"> // 该文档该域上一个词的截止为止</span><br><span class="line"> invertState.lastStartOffset = startOffset;</span><br><span class="line"> // 真正对词进行</span><br><span class="line"> termsHashPerField.add();</span><br><span class="line"> }</span><br><span class="line"> stream.end();</span><br><span class="line"> }</span><br><span class="line"> if (analyzed) { // 若分词的话,</span><br><span class="line"> invertState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name);</span><br><span class="line"> invertState.offset += docState.analyzer.getOffsetGap(fieldInfo.name);</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>This function:<br>1. Resets the invertState statistics; each field of each document is counted separately.<br>2. Gathers the token's statistics into invertState.<br>3. Calls <code>termsHashPerField.add()</code> to build the index structures for the token.<br>4. If the field is tokenized, bumps the position and offset by the configured gaps after each field value; for multi-valued fields this sets the spacing between consecutive values.<br>Step 3 deserves the most attention:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br></pre></td><td class="code"><pre><span class="line">void add() throws IOException {</span><br><span class="line"> // 获取当前term的termId, 此时获取的termId在semgment内该域都是唯一的</span><br><span class="line"> int termID = bytesHash.add(termAtt.getBytesRef()); </span><br><span class="line"> // 该term第一次写入</span><br><span class="line"> if (termID >= 0) {</span><br><span class="line"> // intPool的当前buffer不够用就申请新的byte[]</span><br><span class="line"> if (numPostingInt + intPool.intUpto > IntBlockPool.INT_BLOCK_SIZE) {</span><br><span class="line"> intPool.nextBuffer();</span><br><span class="line"> }</span><br><span class="line"> // intPool的当前buffer不够用就申请新的byte[]</span><br><span class="line"> if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {</span><br><span class="line"> bytePool.nextBuffer();</span><br><span class="line"> }</span><br><span class="line"> //此处 streamCount 为 2,表明在intPool中,一个词将占用2位,一个是指向该词的bytePool中docId&freq存储位置,一个指向该词的bytePool中position&offset存储位置。</span><br><span class="line"> intUptos = intPool.buffer;</span><br><span class="line"> // int当前buffer内的可分配位置</span><br><span class="line"> intUptoStart = intPool.intUpto; </span><br><span class="line"> // 先在intPool中申请2个位置</span><br><span class="line"> intPool.intUpto += streamCount; </span><br><span class="line"> // 第i个词在intintPool中(为了快速找到该词的int位置绝对起始起始位置</span><br><span class="line"> postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset; </span><br><span class="line"> //在 bytePool 中分配两个空间,一个放 freq 信息,一个放 prox 信息的。</span><br><span class="line"> for(int i=0;i<streamCount;i++) {</span><br><span class="line"> // 返回的是该slice的相对起始位置,存放docId&freq</span><br><span class="line"> final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);</span><br><span class="line"> // 存放position&offset</span><br><span class="line"> intUptos[intUptoStart+i] = upto + bytePool.byteOffset; </span><br><span class="line"> }</span><br><span 
class="line"> // 把该termId的使用byteBlockPool起始位置给记录下来</span><br><span class="line"> postingsArray.byteStarts[termID] = intUptos[intUptoStart]; </span><br><span class="line"> // 开始向两个slice中存放该词的docId&freq和position&offset。</span><br><span class="line"> newTerm(termID);</span><br><span class="line"> } else { </span><br><span class="line"> // 这说明该词已经出现过一次, 返回该词的termId, 使用小技巧,将返回termId编码为负值</span><br><span class="line"> termID = (-termID)-1; </span><br><span class="line"> // 返回该词在在intPool中的起始位置</span><br><span class="line"> int intStart = postingsArray.intStarts[termID];</span><br><span class="line"> // intPool中当前使用的buffer的相对起始位置</span><br><span class="line"> intUptos = intPool.buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT];</span><br><span class="line"> intUptoStart = intStart & IntBlockPool.INT_BLOCK_MASK;</span><br><span class="line"> // 向两个slice中追加该词的docId&freq和position&offset。</span><br><span class="line"> addTerm(termID);</span><br><span class="line"> }</span><br><span class="line"> // 开始统计termVector需要的bytePool中docId&freq和bytePool中position&offset</span><br><span class="line"> if (doNextCall) { </span><br><span class="line"> nextPerField.add(postingsArray.textStarts[termID]); </span><br><span class="line"> } </span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>For the current term, this function:<br>1. Determines the term's termId via <code>bytesHash.add</code>; the termId is unique within the segment.<br>2. If the term is written to this segment for the first time, reserves 2 ints in intPool and two slices in bytePool; the two ints act as pointers into the two slices. It then calls <code>newTerm</code> to record the termId's docId&freq and position&offset into those slices (the structure is shown in a figure below).<br>3. If the term's termId already exists, it calls <code>addTerm</code> to append the statistics to the same two slices.<br>4. Calls <code>nextPerField.add</code> to build the termVector's index structures, covered in detail in <a href="https://kkewwei.github.io/elasticsearch_learning/2020/03/02/Lucene8-2-0%E5%BA%95%E5%B1%82%E6%9E%B6%E6%9E%84-tvd-tvm%E8%AF%8D%E5%85%B8%E5%90%91%E9%87%8F%E7%BB%93%E6%9E%84%E7%A0%94%E7%A9%B6/">Lucene8.2.0底层架构-tvd/tvm词典向量结构研究</a>.</p>
<p>First let's see how <code>bytesHash.add</code> stores the term value:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br></pre></td><td class="code"><pre><span class="line"> final int length = bytes.length;</span><br><span class="line"> // 使用探针法, 直到找到当前要存储的bytes所在的有效槽位</span><br><span class="line"> final int hashPos = findHash(bytes);</span><br><span class="line"> int e = ids[hashPos]; </span><br><span class="line"> //如果为-1,则是新的term</span><br><span class="line"> if (e == -1) {</span><br><span class="line"> final byte[] buffer = pool.buffer;</span><br><span class="line"> final int bufferUpto = pool.byteUpto;// 获取内存池的起始可用位置</span><br><span class="line"> count++;</span><br><span class="line"> // 记录对应termId在bytePool中的内容。freqProxPostingsArray.textStarts和bytesStart是同一个对象</span><br><span class="line"> bytesStart[e] = bufferUpto + pool.byteOffset;</span><br><span class="line"> // 在pool首先存储len(bytes),在存储值</span><br><span class="line"> if (length < 128) {</span><br><span class="line"> buffer[bufferUpto] = (byte) length;</span><br><span class="line"> pool.byteUpto += length + 1;</span><br><span class="line"> assert length >= 0: "Length must be positive: " + length;</span><br><span class="line"> System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 1,</span><br><span class="line"> length);</span><br><span class="line"> } else {</span><br><span class="line"> buffer[bufferUpto] = (byte) (0x80 | (length & 0x7f));</span><br><span class="line"> buffer[bufferUpto + 1] = (byte) ((length >> 7) & 0xff);</span><br><span class="line"> pool.byteUpto += length + 2;</span><br><span class="line"> System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 2,</span><br><span class="line"> length);</span><br><span class="line"> }</span><br><span class="line"> ids[hashPos] = e; </span><br><span class="line"> return e;</span><br><span class="line"> }</span><br><span class="line"> // 该term已经存在,直接返回termId</span><br><span class="line"> return -(e + 1);</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>This function stores the term into the pool, using the hash table to quickly locate where the term lives.</p>
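<p>A standalone worked example of the length prefix used above: lengths below 128 fit in one byte, while longer terms spread the length over two bytes, the first carrying the low 7 bits plus a 0x80 continuation flag. This is an illustration, not Lucene code:</p>
<figure class="highlight plain"><pre><code>public class LengthPrefix {
    static int writeLength(byte[] buffer, int upto, int length) {
        if (length < 128) {
            buffer[upto] = (byte) length;                     // 1-byte prefix
            return 1;
        } else {
            buffer[upto] = (byte) (0x80 | (length & 0x7f));   // low 7 bits + flag bit
            buffer[upto + 1] = (byte) ((length >> 7) & 0xff); // remaining high bits
            return 2;
        }
    }

    public static void main(String[] args) {
        byte[] buf = new byte[4];
        System.out.println(writeLength(buf, 0, 100)); // 1 byte: 0x64
        System.out.println(writeLength(buf, 0, 300)); // 2 bytes: 0xAC 0x02
    }
}</code></pre></figure>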
<p>Next, how <code>newTerm</code> records a freshly seen term:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br></pre></td><td class="code"><pre><span class="line">void newTerm(final int termID) {</span><br><span class="line"> final FreqProxPostingsArray postings = freqProxPostingsArray; //</span><br><span class="line"></span><br><span class="line"> postings.lastDocIDs[termID] = docState.docID;</span><br><span class="line"> //DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS递进</span><br><span class="line"> if (!hasFreq) { </span><br><span class="line"> ......</span><br><span class="line"> } else { </span><br><span class="line"> // 统计词频</span><br><span class="line"> postings.lastDocCodes[termID] = docState.docID << 1;</span><br><span class="line"> postings.termFreqs[termID] = getTermFreq();</span><br><span class="line"> // 存储offset, position到第第二个slice中</span><br><span class="line"> if (hasProx) { </span><br><span class="line"> // 向stream1中存储了proxCode</span><br><span class="line"> writeProx(termID, fieldState.position); </span><br><span class="line"> if (hasOffsets) {</span><br><span class="line"> // 向stream1中存储了offserCode,</span><br><span class="line"> writeOffsets(termID, fieldState.offset);</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> fieldState.maxTermFrequency = Math.max(postings.termFreqs[termID], fieldState.maxTermFrequency);</span><br><span class="line"> }</span><br><span class="line"> fieldState.uniqueTermCount++;</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>This records the following for the term:<br>1. Its position and offset.<br>2. The term's frequency is not flushed here; it can only be settled once the document's field has been fully consumed (the flush happens only when the same term later shows up with a different docId).</p>
<p>Now let's see how freq, position and offset are recorded for a term that already exists:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br></pre></td><td class="code"><pre><span class="line">void addTerm(final int termID) {</span><br><span class="line"> final FreqProxPostingsArray postings = freqProxPostingsArray;</span><br><span class="line"> if (!hasFreq) {//不需要词频</span><br><span class="line"> ......</span><br><span class="line"> } else if (docState.docID != postings.lastDocIDs[termID]) { </span><br><span class="line"> // 上一个文档的该域所有term已经处理完了</span><br><span class="line"> if (1 == postings.termFreqs[termID]) { // 词频为1</span><br><span class="line"> writeVInt(0, postings.lastDocCodes[termID]|1); </span><br><span class="line"> } else {</span><br><span class="line"> writeVInt(0, postings.lastDocCodes[termID]);</span><br><span class="line"> writeVInt(0, postings.termFreqs[termID]);</span><br><span class="line"> }</span><br><span class="line"> //初始化当前文档的词频</span><br><span class="line"> postings.termFreqs[termID] = getTermFreq(); </span><br><span class="line"> fieldState.maxTermFrequency = Math.max(postings.termFreqs[termID], fieldState.maxTermFrequency);</span><br><span class="line"> // 这次出现文档-上次出现文档</span><br><span class="line"> postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;</span><br><span class="line"> postings.lastDocIDs[termID] = docState.docID;</span><br><span class="line"> if (hasProx) {</span><br><span class="line"> // 保存termId</span><br><span class="line"> writeProx(termID, fieldState.position);</span><br><span class="line"> if (hasOffsets) {</span><br><span class="line"> // 保存offset</span><br><span class="line"> postings.lastOffsets[termID] = 0;</span><br><span class="line"> writeOffsets(termID, fieldState.offset);</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> fieldState.uniqueTermCount++;</span><br><span class="line"> } else { </span><br><span class="line"> // 当前文档当前域的term还没有处理完</span><br><span class="line"> postings.termFreqs[termID] = Math.addExact(postings.termFreqs[termID], getTermFreq()); </span><br><span class="line"> // 增加词频</span><br><span class="line"> fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, 
postings.termFreqs[termID]);</span><br><span class="line"> if (hasProx) { </span><br><span class="line"> // 继续统计post</span><br><span class="line"> writeProx(termID, fieldState.position-postings.lastPositions[termID]);</span><br><span class="line"> if (hasOffsets) {</span><br><span class="line"> // 统计offser</span><br><span class="line"> writeOffsets(termID, fieldState.offset);</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>By now the termId already exists in the bytePool:<br>1. The method checks whether the docId of the term's last occurrence matches the current document. If not, the previous document is finished for this term, so its docId code and frequency are flushed into the first slice. (You might wonder why a term's first occurrence needs no such check: when a term appears for the very first time, there simply is no previous document.)<br>2. The term's position&offset are appended to the second slice.</p>
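<p>A standalone worked example of the lastDocCodes encoding seen above: the doc delta is shifted left one bit, and the low bit encodes "frequency == 1" so that common single-occurrence terms save one VInt. This is an illustration, not Lucene code:</p>
<figure class="highlight plain"><pre><code>public class DocCodeDemo {
    public static void main(String[] args) {
        int lastDocID = 3, docID = 7, freq = 1;
        int docDelta = docID - lastDocID;     // 4
        int code = docDelta << 1;             // 8 (0b1000)
        if (freq == 1) {
            System.out.println(code | 1);     // 9: low bit set means "freq == 1", no extra VInt
        } else {
            System.out.println(code);         // 8, followed by a separate VInt carrying freq
        }
        // a reader reverses it: delta = code >>> 1; freqFollows = (code & 1) == 0
    }
}</code></pre></figure>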
<p>The resulting in-memory structure looks like this:<br><img src="https://kkewwei.github.io/elasticsearch_learning/img/lucene_tim3.png" height="400" width="400"><br>The array index inside freqProxPostingsArray is the termId; the figure shows the inverted index layout for the term with termId=1.</p>
<h1 id="flush到文件中"><a href="#flush到文件中" class="headerlink" title="flush到文件中"></a>flush到文件中</h1><p>flush到文件中指的是形成一个segment,触发条件有两个(同<a href="https://kkewwei.github.io/elasticsearch_learning/2019/10/29/Lucenec%E5%BA%95%E5%B1%82%E6%9E%B6%E6%9E%84-fdt-fdx%E6%9E%84%E5%BB%BA%E8%BF%87%E7%A8%8B/#%E5%88%B7%E5%88%B0fdx%E6%96%87%E4%BB%B6">fdx</a>,<a href="https://kkewwei.github.io/elasticsearch_learning/2019/11/15/Lucene%E5%BA%95%E5%B1%82%E6%9E%B6%E6%9E%84-dvm-dvm%E6%9E%84%E5%BB%BA%E8%BF%87%E7%A8%8B/#%E5%88%B7%E6%96%B0%E5%88%B0%E6%96%87%E4%BB%B6">dvm</a>一样):<br>1.lucene建立的索引结构占用内存或者缓存文档书超过阈值。该check会在每次索引完一个文档后。<br>2.用户主动调用indexWriter.flush()触发。</p>
<p>Both cases eventually reach <code>BlockTreeTermsWriter.write</code>:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br></pre></td><td class="code"><pre><span class="line">public void write(Fields fields, NormsProducer norms) throws IOException {</span><br><span class="line"> String lastField = null;</span><br><span class="line"> // 遍历该segment内的每个域</span><br><span class="line"> for(String field : fields) {</span><br><span class="line"> lastField = field;</span><br><span class="line"> Terms terms = fields.terms(field); </span><br><span class="line"> if (terms == null) {</span><br><span class="line"> continue;</span><br><span class="line"> }</span><br><span class="line"> // 遍历FreqProxTermsWriterPerField里面每个termId使用的,读取term的顺序前按照字符串排好序了</span><br><span class="line"> TermsEnum termsEnum = terms.iterator();</span><br><span class="line"> // 一个域单独产生一个</span><br><span class="line"> TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field)); </span><br><span class="line"> while (true) {</span><br><span class="line"> // 这里会从FreqProxPostingsArray.textStarts循环遍历每一个termId。</span><br><span class="line"> BytesRef term = termsEnum.next(); </span><br><span class="line"> if (term == null) {</span><br><span class="line"> break;</span><br><span class="line"> }</span><br><span class="line"> // 将该term加入词典及建立字典索引结构。</span><br><span class="line"> termsWriter.write(term, termsEnum, norms); </span><br><span class="line"> }</span><br><span class="line"> // 完成field 的构建。每个单词一个finish</span><br><span class="line"> termsWriter.finish();</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>For each field in the segment, this function iterates over all of the field's terms, stores each term's postings in the .doc file, calls <code>termsWriter.write</code> to build the term dictionary, and finally calls <code>termsWriter.finish()</code> to write the dictionary index, an FST, into the .tip file.</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br></pre></td><td class="code"><pre><span class="line">public void write(BytesRef text, TermsEnum termsEnum, NormsProducer norms) throws IOException {</span><br><span class="line"> // 将该term的倒排索引读取出来并建立索引结构</span><br><span class="line"> BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen, norms); // 针对的是一个词</span><br><span class="line"> if (state != null) {</span><br><span class="line"> // 将当前词加入词典中</span><br><span class="line"> pushTerm(text); </span><br><span class="line"> PendingTerm term = new PendingTerm(text, state);</span><br><span class="line"> //当前term加入待索引列表</span><br><span class="line"> pending.add(term);</span><br><span class="line"> // 该词在多少文档中出现过</span><br><span class="line"> sumDocFreq += state.docFreq;</span><br><span class="line"> //该词总的出现频次</span><br><span class="line"> sumTotalTermFreq += state.totalTermFreq; </span><br><span class="line"> numTerms++; //</span><br><span class="line"> if (firstPendingTerm == null) {</span><br><span class="line"> // 写入的第一个词</span><br><span class="line"> firstPendingTerm = term;</span><br><span class="line"> }</span><br><span class="line"> lastPendingTerm = term;</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>