-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFinanOctopus.py
1076 lines (962 loc) · 50.9 KB
/
FinanOctopus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# -*- coding: utf-8 -*-
import numpy as np
import re
import jieba
import jieba.posseg as pseg
# 分词,POS标签,预处理
def sep_flag_pre(sentence:str):
# preprocessing
# “按。。。的”条件语句去掉
while sentence.count('按') != 0 and sentence.count('的') != 0:
sentence_before = sentence[:sentence.index('按')]
terminalindex = len(sentence_before) + sentence[sentence.index('按'):].index('的')+1
sentence_after = sentence[terminalindex:]
sentence = sentence_before + sentence_after
print(f'[method:sep_flag_pre]已删除条件划分句式中条件')
# 括号内数值会干扰单值特征的时间判断
while sentence.count('(') != 0 and sentence.count(')') != 0:
sentence_before = sentence[:sentence.index('(')]
terminalindex = len(sentence_before) + sentence[sentence.index('('):].index(')')+1
sentence_after = sentence[terminalindex:]
sentence = sentence_before + sentence_after
print(f'[method:sep_flag_pre]已删除句式中括号')
# “注。。数字”干扰时间项的获取
while sentence.count('注') != 0 and sentence.count(':') != 0:
sentence_before = sentence[:sentence.index('注')]
terminalindex = len(sentence_before) + sentence[sentence.index('注'):].index(':')+1
sentence_after = sentence[terminalindex:]
sentence = sentence_before + sentence_after
print(f'[method:sep_flag_pre]已删除句式中注释前缀')
sentence = sentence.replace(' ','').strip()
sentence = sentence.replace(';',',')
sentence = sentence.replace(':',',')
if sentence[-1] == '。':#去除句尾的句号
sentence = sentence[:-1]
# 去除句中的句号
sentence = sentence.replace('。',',')
if sentence[-1] == ',':#去除句尾的逗号
sentence = sentence[:-1]
#找到数值中间的逗号
patcommainnumber = re.compile('(\d,\d)')
# matcher = re.findall(patcommainnumber,sentence)
#消除数字中的逗号
while True:
m = re.search(patcommainnumber,sentence)
if m == None:
break
s,e = m.span()
sentence = sentence[:s+1] + sentence[e-1:]
# 把不能识别为数值的数值转化为能识别的数值
number_matches = re.findall(re.compile('(-?\d+(%|万元|亿元|元|万股|股))'), sentence)
while len(number_matches) != 0:
last_index = 0
for number_match in number_matches:
number_plus_unit = number_match[0]
numberunitindex = sentence.index(number_plus_unit)
while numberunitindex < last_index:
numberunitindex = last_index + sentence[last_index:].index(number_plus_unit)
last_index = numberunitindex
if sentence[numberunitindex-1] != '.':
print(f'[method:sep_flag_pre]数值{number_plus_unit}需转化为可识别数值')
# sentence = sentence.replace(number_plus_unit, number_plus_unit.replace(number_match[1],'')+'.00'+number_match[1])
sentence = sentence[:numberunitindex] + number_plus_unit.replace(number_match[1],'')+'.00'+number_match[1] + sentence[numberunitindex+len(number_plus_unit):]
number_matches = re.findall(re.compile('(-?\d+(%|万元|亿元|元|万股|股))'), sentence)
break
number_matches.remove(number_match)
# 单值子句要为没有谓语的句子的数值前加上谓语
s_p = sentence.split(',')
ms = [re.findall(pat_value_word, p) for p in s_p]
for mi in range(len(ms)):
if ms[mi]!=[]:
last_index = 0 # last_index的作用是保证index是按递增查找的
for value_tuple in ms[mi]:
single_value = value_tuple[0]
value_index = s_p[mi].index(single_value)# 这种处理是不可靠的
while value_index < last_index:
value_index = last_index + s_p[mi][last_index:].index(single_value)
last_index = value_index
post_value_index = clean_subject(s_p[mi]).index(single_value)
word_before_value = s_p[mi][value_index-1]
# 占比特征句式没有“的”
if (s_p[mi][0] == '占' or s_p[mi].count('占') != 0) and (s_p[mi].count('比重') != 0 or s_p[mi].count('比例') != 0) and s_p[mi].count('的') == 0:
print(f'[method:sep_flag_pre]已为占比句式[{s_p[mi]}]完善占比形式1')
s_p[mi] = s_p[mi].replace('比重', '的比重').replace('比例', '的比例')
# 占比类句式无“比重”/“比例” 无法形成占比特征的匹配
if (s_p[mi][0] == '占' or s_p[mi].count('占') != 0) and (s_p[mi].count('比重') == 0 and s_p[mi].count('比例') == 0):
print(f'[method:sep_flag_pre]已为占比句式[{s_p[mi]}]完善占比形式2')
s_p[mi] = s_p[mi].replace(single_value, '的比例为'+single_value)
s_p[mi] = s_p[mi].replace('的的','的')
continue
if word_before_value not in ['为','、','和','的','以及','及'] and post_value_index!=0:
print(f'[method:sep_flag_pre]已为无谓语句式[{s_p[mi]}]添加谓词前置')
s_p[mi] = s_p[mi].replace(single_value, '为'+single_value)
sentence = ','.join(s_p)
# 去除干扰模式的单位
sentence = sentence.replace('次/年','')
# 去除干扰模式的时间词
sentence = sentence.replace('当前','')
# 凸显模糊时间的表达
sentence = sentence.replace('本报告期','本期')
# 去除确定时间的干扰
sentence = sentence.replace('截止','')
sentence = sentence.replace('截至','')
# 去除单值特征的干扰
sentence = sentence.replace('达','')
# 去除变动特征的干扰
sentence = sentence.replace('由','从')
# get pos tags out of sub sentences
parse_words=[]
parse_wordsequences=[]
parse_flagsequences=[]
sub_sentences = sentence.split(',')
for sub in sub_sentences:
words = pseg.cut(sub)
wordl = '|'.join([x+'【'+y+'】' for x,y in list(words) if x != '款'])# 去除非时间flag的一场影响字
parse_words.append(wordl)
wordsequence='|'.join([i[:i.index('【')] for i in wordl.split('|')])
parse_wordsequences.append(wordsequence)
# 为了与wordsequence中的下标完全对应
flagsequence='|'.join([i[i.index('【')+1:i.index('】')] for i in wordl.split('|')])
parse_flagsequences.append(flagsequence)
return ' '.join(parse_flagsequences), ' '.join(parse_wordsequences), sentence
# 是否需要加入dp前缀,待后设计决定
pat_multi_value = re.compile('((x?mm?x)*x?mm?cx?mm?)')
pat_multi_rate = re.compile('((x?mxx)*x?mxcx?mx)')# 这种情况可能有潜在公式 需再度判断句子是否含有占比前缀(v*ujn)
# 单值赋值
pat_single_value = re.compile('([lvnt]*p((x?mm)|(x?mx)))|(((x?mm)|(x?mx))p[lvnt]*)')
# 带主语的多值
pat_multi_subject_value = re.compile('(([lvnthujxdrm]*p?mmx)*[lvnthujxdrm]*p?mmc[lvnthujxdrm]*p?mm)')
# 是否需要加入dp后缀,待后设计决定
# 占比句子特征:*占*的比重
pat_take_percentage = re.compile('(v[blvnthujxdrm]*ujn)')
# 假设:值都是有小数点的,此正则针对的是词
pat_value_word = re.compile('(-?\d+\.\d{1,3}%?(万元)?(亿元)?(元)?(万股)?(股)?)')
# 需要转义时间句子特征
cover_time_sample_flags=['lcmt','lcd','fmcmt','fmcd','tcd','tcmt','fmt','fm','nrt','nf','t']
cover_time_regx='|'.join(cover_time_sample_flags)
pat_cover_time = re.compile(cover_time_regx) # 对应关系后续针对性解析
# 不需转义时间句子特征
# 样例
uncover_time_sample=['2019年1-6月','2016','2017-12-31','2017','2019-6-30','2018年度', '2017年度', '20190106', '2016年度', '2018-12-31', '2016-12-31', '2018','2018 年', '2017年', '2019年6月30日', '2016年', '2018年', '2007年7月', '2007年07月', '2018-12-31/2018年度', '2016-12-31/2016年度', '2019-6-30/2019年1-6月', '2019-06-03/2019年度', '2017-12-31/2017年度', '2017 年', '2018年第 2 季度', '2019 年 1-6 月', '2016 年', '2017年度期末', '2019年6月末', '2016年末', '2018年末', '2017年末', '2019 年 6 月末', '2016年9月7日', '2016年09月07日']
# 消除生成的不稳定的模式的情况,原因:单个数字在-符号中间被标记为x,这是不愿看到的
pat_dateformat = re.compile('(\d{4}-\d{1,2}-\d{1,2})')
def date_standardize(datestring):
m = re.search(pat_dateformat, datestring)
if m==None:
return datestring
#xxxx-xx-xx xxxx-x-xx xxxx-xx-x xxxx-x-x
date = m.group(0)
dates = date.split('-')
month = dates[1]
day = dates[2]
if len(month)!=2:
month = '0'+month
if len(day)!=2:
day = '0'+day
return datestring.replace(date, '-'.join([dates[0], month, day]))
uncover_time_sample_flags = []
for t in uncover_time_sample:
flags, _, _ = sep_flag_pre(date_standardize(t))
flags = flags.replace('|','')
uncover_time_sample_flags.append(flags)
# 所有样例形成的时间POS组合库, 这将作为规则来找寻时间
uncover_time_sample_flags = list(set(uncover_time_sample_flags))
# 重排序保证优先匹配长规则
uncover_time_sample_flags.sort(key=lambda x: len(x), reverse=True)
# 形成uncover时间正则规则
uncover_time_regx = '|'.join(uncover_time_sample_flags)
pat_uncover_time = re.compile(uncover_time_regx)
# 值变动类句子特征(需要结合uncover的时间正则)
pat_change_value1 = re.compile('(d('+uncover_time_regx+')vp?mm)') #日期1科目x较日期2增加/减少数值
# 这种变动类包含取值类句子特征
pat_change_value2 = re.compile('((p|c)('+uncover_time_regx+')ujmmvp('+uncover_time_regx+')ujp?mm)') #从2014年末的数值1增长/减少至2016年末的数值2
# 比例变动类句子特征(需要结合uncover的时间正则)
pat_change_rate = re.compile('(d('+uncover_time_regx+')vp?mx)') #日期1科目x较日期2增加/减少比例
# 值与比例形成的公式完全不同
def re_extractor(pat, flags_plain, delimiter, in_model_matrix=True):
r=[]
for f in flags_plain.split(delimiter):
blocklist = re.findall(pat, f)
if blocklist != []:
blocklist=list(map(lambda x: max(x, key=lambda y: len(y)) if type(x)==tuple else x, blocklist))
else:
r.append(None)
continue
if len(blocklist)==1:
r.append(blocklist[0])
else:
if in_model_matrix == True:
r.append(blocklist[0])
else:
r.append(blocklist)
return r
# 找第no个ujn
def find_ujn(l, no=1):
b_no = no
for i,flag in enumerate(l):
if flag=='uj' and l[i+1]=='n':
if no==1:
return i
else:
no -= 1
print(f'第{b_no}个ujn不存在于该列表中')
return None
def locate_itemindex_in_take_percentage(flags, index_block, current_regx):
original_span = current_regx[1: current_regx.index('ujn')]
print(f'original_span={original_span}')
# 定位文本对象
sub_flags=flags.split(' ')[index_block].split('|')
try:
# 找到所有可能的ujn
for hypo in range(1, len(sub_flags)+1):
index_ujn=find_ujn(sub_flags ,hypo)
# 比对中间片段
for i,flag in enumerate(sub_flags):
if flag=='v':
if i<index_ujn:
# 取中间字符串
middle_span = ''.join(sub_flags[i+1:index_ujn])
print(f'当前middle_span={middle_span}')
if original_span == middle_span:
print(f'found!')
return i+1, index_ujn
else:
print(f'index为{index_ujn}的ujn无效,跳过')
break
except:
print('something wrong, no output')
return None
# 方法与上相同
# 如果应用在主语上难以保证general_regx的唯一性,这个方法仅用于时间的挖取
# no代表第no个匹配项
def locate_itemindex_general(flags, index_block, general_regx, no=1):
#print(f'flags:{flags} index_block:{index_block} general_regx:{general_regx} no:{no}')
if general_regx == None:
return None
if type(general_regx) == list:
print('[method:locate_itemindex_general]---出现了一个多具体时间的列表')
res_index_list = []
for rg in general_regx:
res_index_list.append(locate_itemindex_general(flags, index_block, rg, no))
return res_index_list
span_len = len(general_regx)
sub_flags=flags.split(' ')[index_block].split('|')
try:
count = 0
for i,flag in enumerate(sub_flags):
# 错误的解法 因为不是所有flag都是1位,2位就会扰乱这个逻辑
# for block in range(span_len):
# if flag==general_regx[block]:
# # 向后验证
# flag=sub_flags[i+1]
# else:
# break
# # 当前循环完美走完说明匹配项找到
# return i, i+span_len
# 反向验证
move_step = 1
move_span = ''.join(sub_flags[i: i+move_step])
while move_span in general_regx and move_step<=len(sub_flags):
if move_span == general_regx:
count = count + 1
if no == count:
return i, i+move_step
move_step+=1
move_span = ''.join(sub_flags[i: i+move_step])
except:
print('[method:locate_itemindex_general]---something wrong, no output')
print(f'[method:locate_itemindex_general]--第{index_block}句{general_regx}的时间未找到')
return None
# 占比特征识别出时寻找主语(分子)
# 这是一个需要不断扩充的函数
def locate_numeratorindex_in_take_percentage(words, flags, index_block, take_percentage_start_index):
# 目前考虑四种情况:1.主语全部罗列在本句;2.单主语,需要按照时间展开,在本句
# 3.非本句,全罗列; 4.单主语,需要按照时间展开,非本句
res_index=[]
if take_percentage_start_index-1 == 0: #“占”在句首
# 这得跨句子找
# 重要假设:所在分句子value数对应
index_block = index_block - 1
print(f'向前检查第{index_block}子句')
last_sent_first_word = words.split(' ')[index_block].split('|')[0]
print(f'该句句首:{last_sent_first_word}')
if last_sent_first_word != '占':
return locate_subjectindex_general(words, flags, index_block)
return locate_numeratorindex_in_take_percentage(words, flags, index_block, 1)
else:
# 切分新子句
before_subflags = flags.split(' ')[index_block].split('|')[0:take_percentage_start_index-1]
# 是否有多值
expr = before_subflags.count('c')>=1 and before_subflags.count('x')>=1
# if m!=None:# 有多值 不以正则表达式作为判断依据
if expr:
# 挖值
# 取末尾连接符 方法locate_itemindex_general无意义
# last_c_index = sub_regx[::-1].index('c')
# subject_flag_list = sub_regx[:len(sub_regx)-last_c_index-1].split('x')
# subject_flag_list.append(sub_regx[len(sub_regx)-last_c_index:])
# print(f'多值主语flags: {subject_flag_list}')
# for subject_flag in subject_flag_list:
# res_index.append(locate_itemindex_general(index_block, subject_flag))
base_index = 0 # 这也是一种极端假设
# how many x there are?
num_x = before_subflags.count('x')+1
for i in range(num_x):
if i == 0:
print('first')
s_i = base_index
e_i = before_subflags.index('x')
else:
try:
print('middle')
s_i = e_i + 1
e_i = s_i + before_subflags[s_i: ].index('x')
except:
print('end')
s_i = e_i + 1
e_i = s_i + before_subflags[::-1].index('x') - before_subflags[::-1].index('c') - 1
res_index.append((s_i, e_i))
s_i = e_i + 1
e_i = s_i + before_subflags[::-1].index('c')
res_index.append((s_i, e_i))
break
res_index.append((s_i, e_i))
return index_block, res_index
else: # 单值, 要按照时间scale up, 在gearup函数中
base_index = 0 # 这也是一种极端假设
return index_block, (base_index, take_percentage_start_index-1)
def locate_subjectindex_general(words, flags, index_block, baseindex=0,no=1): # 粗略地确定一个分句的主语
print(f'[method:locate_subjectindex_general]开始定位第{index_block}句主语')
#baseindex=0 # 重要假设:主语从句首开始
sub_flags = flags.split(' ')[index_block].split('|')
count = 0
def return_subjectindex(words, index_block, baseindex, i, no):
# 得到的主语索引需要验证,如果clean后的主语变空了则需要往后找谓语前的名词
if baseindex<i:
name = from_index_to_span(words, index_block, (baseindex, i))
if clean_subject(name) == '':
no += 1
return locate_subjectindex_general(words, flags, index_block, baseindex, no)
return index_block, (baseindex, i)
else:
name = from_index_to_span(words, index_block, (i+1, baseindex))
if clean_subject(name) == '':
no += 1
return locate_subjectindex_general(words, flags, index_block, baseindex, no)
return index_block, (i+1, baseindex)
# 主语规则: 在dp,bp,d或p前的子句部分 类似于一种公式中的等式和集合中的从属符号
for i,flag in enumerate(sub_flags):
if flag == 'd' and sub_flags[i+1] == 'p':
print('[method:locate_subjectindex_general]优先匹配dp前字段')
if i==0: # 句首 则 跳句
index_block = index_block - 1
return locate_subjectindex_general(words, flags, index_block, baseindex)
count += 1
if count == no:
return return_subjectindex(words, index_block, baseindex, i, no)
for i,flag in enumerate(sub_flags):
if flag == 'b' and sub_flags[i+1] == 'p':
print('[method:locate_subjectindex_general]次优先匹配bp前字段')
if i==0: # 句首 则 跳句
index_block = index_block - 1
return locate_subjectindex_general(words, flags, index_block, baseindex)
count += 1
if count == no:
return return_subjectindex(words, index_block, baseindex, i, no)
for i,flag in enumerate(sub_flags):
if flag == 'p':
print('[method:locate_subjectindex_general]再次先匹配p前字段')
if i==0: # 句首 则 跳句
index_block = index_block - 1
return locate_subjectindex_general(words, flags, index_block, baseindex)
count += 1
if count == no:
return return_subjectindex(words, index_block, baseindex, i, no)
for i,flag in enumerate(sub_flags):
if flag == 'd':
print('[method:locate_subjectindex_general]再次匹配d前字段')
if i==0: # 句首 则 跳句
index_block = index_block - 1
return locate_subjectindex_general(words, flags, index_block, baseindex)
count += 1
if count == no:
return return_subjectindex(words, index_block, baseindex, i, no)
# 没有谓语 继续向前
index_block = index_block - 1
return locate_subjectindex_general(words, flags, index_block, baseindex)
def from_index_to_span(words, index_block, args):
words_list = words.split(' ')[index_block].split('|')
if type(args)==tuple:
start_index, end_index = args
res_string = ''.join(words_list[start_index: end_index])
return res_string
if type(args)==list:
res_list = []
for start_index, end_index in args:
res_list.append(''.join(words_list[start_index: end_index]))
return res_list
if args == None:
print('时间查找失败')
return None
print('Unknown type args: {args}')
return None
def pick_value_regx_out(value_block):
if value_block==None:
return value_block
if type(value_block) == str:
value_block = [value_block]
return [''.join([y for x,y in list(pseg.cut(value))]) for value in value_block]
def find_time_regx(values, flags_plain, index_block, pat_time, pat_type):
print(f'[method:find_time_regx->start]---pat_time:{pat_time} pat_type:{pat_type}')
time_regx_list=re_extractor(pat_time, flags_plain, ' ', pat_type)
if pat_type==False: #uncover
# 取出 index_block前所有的value的regx
value_block_regx_list_before_indexblock = [pick_value_regx_out(values[ib]) for ib in range(index_block+1)]
print(f'[method:find_time_regx]---value_block_regx_list_before_indexblock:{value_block_regx_list_before_indexblock}')
# 把value regx都扣掉
time_regx_list=re_extractor(pat_time, flags_plain, ' ', pat_type)
print(f'[method:find_time_regx]---扣值前time_regx_list:{time_regx_list}')
time_regx_list_temp=[]
for irl in range(index_block+1):
if value_block_regx_list_before_indexblock[irl] != None:
for r in value_block_regx_list_before_indexblock[irl]:
print(f'[method:find_time_regx]----从当前值正则列表中取出{r}')
# mx 的值在原句中识别为单m
if r == 'mx' or r == 'xmx' or r == 'mxq':
r = 'm'
if r == 'xmm':
r = 'mm'
try:
print(f'[method:find_time_regx]----当前time_regx_list[{irl}]是{time_regx_list[irl]}')
if type(time_regx_list[irl]) == list:
time_regx_list[irl].reverse()
time_regx_list[irl].remove(r)
time_regx_list[irl].reverse()
else:
if time_regx_list[irl]==r:
time_regx_list[irl] = None
except:
print(f'[method:find_time_regx]-----{r} 不在')
if r == 'mx':
time_regx_list[irl].reverse()
time_regx_list[irl].remove('m')
time_regx_list[irl].reverse()
if r == 'm': # m容易被大正则合并
time_regx_list[irl]=[]
break
if time_regx_list[irl] == []:
time_regx_list[irl] = None
if time_regx_list[irl] != None and len(time_regx_list[irl])==1:
time_regx_list[irl] = time_regx_list[irl][0]
time_regx_list_temp.append(time_regx_list[irl])
time_regx_list = time_regx_list_temp
print(f'[method:find_time_regx]---扣值后time_regx_list:{time_regx_list}')
else: # cover
if not any(time_regx_list):
print('[method:find_time_regx]---无模糊时间匹配,说明有具体时间线')
return find_time_regx(values, flags_plain, index_block, pat_uncover_time, False)
def find_time_regx_sub(index_block, pat_time): # pat_type 为True表示 cover time
print(f'[method:find_time_regx_sub]---time_regx_list:{time_regx_list}, index_block:{index_block}')
time_regx = time_regx_list[index_block]
if index_block == 0:
if type(time_regx)==str:
return time_regx, index_block
else:
print(f'[method:find_time_regx_sub]----time_regx:{time_regx} 是列表')
if time_regx == [None]:
return None, index_block
return time_regx, index_block
if time_regx != None:
return time_regx, index_block
return find_time_regx_sub(index_block-1, pat_time)
time_regx, index_block = find_time_regx_sub(index_block, pat_time)
print('[method:find_time_regx->end]')
return time_regx, index_block
def clean_subject(subjectname):
# 主语优化
if type(subjectname) == str:
if subjectname[0:3]=='发行人':
subjectname = subjectname[3:]
if subjectname[0:3]=='本公司':
subjectname = subjectname[3:]
if subjectname[0:3]=='该公司':
subjectname = subjectname[3:]
if subjectname[0:3]=='最初系':
subjectname = subjectname[3:]
if subjectname[0:2]=='实现':
subjectname = subjectname[2:]
if subjectname[0:2]=='公司':
subjectname = subjectname[2:]
if subjectname[0:2]=='本期':
subjectname = subjectname[2:]
if subjectname[0:2]=='分别':
return clean_subject(subjectname[2:])
if subjectname[0:2]=='其中':
return clean_subject(subjectname[2:])
if subjectname[0:2]=='主要':
return clean_subject(subjectname[2:])
if subjectname[0:2]=='包括':
return clean_subject(subjectname[2:])
if subjectname[0:1]=='由':
return clean_subject(subjectname[1:])
if subjectname!='' and subjectname[-1]=='占':
subjectname = subjectname[:-1]
if subjectname[-2:] == '变更':
subjectname = subjectname[:-2]
if subjectname[-2:] == '增至':
subjectname = subjectname[:-2]
if subjectname[-2:] == '减至':
subjectname = subjectname[:-2]
if subjectname[-3:] == '增加至':
subjectname = subjectname[:-3]
if subjectname[-3:] == '减少至':
subjectname = subjectname[:-3]
else:
new_subjectname = []
for s in subjectname:
s = clean_subject(s)
new_subjectname.append(s)
return new_subjectname
return subjectname
# 判断当前所获得时间是否需要继续前跳
def is_nonetime(time_string):
contain_time_item_pool = ['一年内','少数股东权益']
if type(time_string) == list:
return False
if time_string.isdigit():
return len(time_string) != 4 or time_string[:2] not in ('19','20')
return time_string in contain_time_item_pool
# 模式找寻
# 公式一-占比类: 占x的比重是目前唯一出现需要判断公式的地方
# 公式二-变动类: 日期1科目x较日期2增加/减少数值/比例
# 主语继承,主语穿透
# 归因总结语句假设不重要
# 子句主语识别,以value数为主语数基准
def gearup(flags, words, values, flags_plain, quadraples, index_block, value_block, subjectname, has_formula, related_subjectname=None):
# 这个方法聚合四元组并解析时间
print(f'[method:gearup->start]---value_block:{value_block} subjectname:{subjectname} has_formula:{has_formula} related_subjectname:{related_subjectname} ')
# 对输入进行校验
if has_formula and related_subjectname==None:
print('公式涉及元素不全')
return None
subjectname = clean_subject(subjectname)
if related_subjectname != None:
related_subjectname = clean_subject(related_subjectname)
# 情况1: 多个主语,那么时间是复用的(找到唯一时间)。重要假设:这种情况的时间是uncover_time
if type(subjectname) == list:
print('[method:gearup]----情况1')
# 从本句开始向前子句找
time_regx, target_indexblock = find_time_regx(values, flags_plain, index_block, pat_uncover_time, False)
time_string = from_index_to_span(words, target_indexblock, locate_itemindex_general(flags, target_indexblock, time_regx))
print(f'[method:gearup]----多主语对应具体时间为: {time_string}')
if time_string in subjectname:
subjectname = subjectname.replace(time_string, '')
subjectname = clean_subject(subjectname)
# 整理结果
for ni in range(len(subjectname)):
quadraple_dict={}
if has_formula:
quadraple_dict['item'] = subjectname[ni] + '/' + related_subjectname
else:
quadraple_dict['item'] = subjectname[ni]
quadraple_dict['time'] = time_string
quadraple_dict['value'] = value_block[ni] # potencial error
quadraples.append(quadraple_dict)
# 情况2: 单分子/单主语,默认是时间线,时间是展开的。重要假设:这种情况的时间是cover_time
# 情况3: 单主语为变动类模式下,时间是uncover_time
# 情况4: 单主语为单值模式,时间是uncover_time,无公式
else:
print(f'[method:gearup]---related_subjectname:{related_subjectname}, subjectname:{subjectname}, has_formula:{has_formula}')
#情况3
if related_subjectname != None and related_subjectname == subjectname: # 变动类 相关主语为其他时间下的subjectname
print('[method:gearup]----情况3')
time_regx, target_indexblock = find_time_regx(values, flags_plain, index_block, pat_uncover_time, False)
print(f'[method:gearup]----单主语[变动特征]对应具体时间正则为: {time_regx}')
if type(time_regx) == list:
time_string1 = from_index_to_span(words, target_indexblock, locate_itemindex_general(flags, target_indexblock, time_regx[0], 1))
if time_regx[0] == time_regx[1] or time_regx[1] in time_regx[0]:# 相同正则或包含 则 按count往后寻
time_string2 = from_index_to_span(words, target_indexblock, locate_itemindex_general(flags, target_indexblock, time_regx[1], 2))
else:
time_string2 = from_index_to_span(words, target_indexblock, locate_itemindex_general(flags, target_indexblock, time_regx[1], 1))
else:
time_string2 = from_index_to_span(words, target_indexblock, locate_itemindex_general(flags, target_indexblock, time_regx))
# 以 has_formula 区分是否是 变动1类(True)或者变动2类(False)
if has_formula:
# 向前找具体时间
time_regx, target_indexblock = find_time_regx(values, flags_plain, index_block-1, pat_uncover_time, False)
if type(time_regx) != list:
time_string1 = from_index_to_span(words, target_indexblock, locate_itemindex_general(flags, target_indexblock, time_regx))
else:
time_string1 = from_index_to_span(words, target_indexblock, locate_itemindex_general(flags, target_indexblock, time_regx[0]))
else:
if value_block[-1] != '%': # 以值是百分比还是数额 区分补充语句是独立还是继承
print('[method:gearup]-----补充语句为独立语句不再上溯时间')
time_string1 = '无'
else:
print('[method:gearup]-----补充语句为进一步解释性语句,沿用变动1类时间以及主语')
# time_regx, target_indexblock = find_time_regx(values, flags_plain, index_block-1, pat_uncover_time, False)
# 默认为 变动1类
# time_string1 = from_index_to_span(target_indexblock, locate_itemindex_general(target_indexblock, time_regx))
# time_string2 = time_string1
time_string1 = '沿用'
time_string2 = '沿用'
print(f'[method:gearup]----单主语[变动特征]对应具体时间为: {time_string1} 和 {time_string2}')
# 主语中由于是基于假设从句首开始,需要屏蔽时间开头
if time_string1 in subjectname:
subjectname = subjectname.replace(time_string1, '')
if time_string2 in subjectname:
subjectname = subjectname.replace(time_string2, '')
if subjectname.count('较') != 0:
subjectname = subjectname[:subjectname.index('较')]
subjectname = clean_subject(subjectname)
# 判断变动类型
if type(value_block) == list: # 多值域的不变动类是值变动类2
time_strings = [time_string1, time_string2]
for vi in range(len(value_block)):
quadraple_dict={}
quadraple_dict['item'] = subjectname
quadraple_dict['time'] = time_strings[vi]
quadraple_dict['value'] = value_block[vi]
quadraples.append(quadraple_dict)
else:
quadraple_dict={}
if not has_formula: # 补充句式
if value_block[-1] != '%':
quadraple_dict['item'] = subjectname
quadraple_dict['time'] = time_string2
quadraple_dict['value'] = value_block
quadraples.append(quadraple_dict)
else:
last_quadraple_dict = quadraples[-1]
quadraple_dict['item'] = last_quadraple_dict['item'].split(' ')[0] + ' 的 ' + subjectname
quadraple_dict['time'] = last_quadraple_dict['time']
quadraple_dict['value'] = value_block
quadraples.append(quadraple_dict)
else:
# 变动类一定是公式 且均为单个值
word_list = words.split(' ')[index_block].split('|')
if value_block[-1] == '%' or value_block[-1] == '元':
v = value_block[:-1]
if value_block[-1] == '元' and (value_block[-2] == '万' or value_block[-2] == '亿'):
v = value_block[:-2]
verb_index = word_list.index(v)
verb = word_list[verb_index-2]
print(f'[method:gearup]-----数值{v}前的动词:{verb}')
quadraple_dict['item'] = subjectname + ' 的 ' + verb
quadraple_dict['time'] = time_string2 + ' 至 ' + time_string1
quadraple_dict['value'] = value_block
quadraples.append(quadraple_dict)
else:
#情况4
if related_subjectname==None and not has_formula and type(value_block)==str:
print('[method:gearup]----情况4')
time_regx, target_indexblock = find_time_regx(values, flags_plain, index_block, pat_uncover_time, False)
time_string = from_index_to_span(words, target_indexblock, locate_itemindex_general(flags, target_indexblock, time_regx))
if time_string == None:
return
while is_nonetime(time_string) and target_indexblock >= 0:
print(f'[method:gearup]----跳过具体时间{time_string}(主语成分)')
time_regx, target_indexblock = find_time_regx(values, flags_plain, target_indexblock-1, pat_uncover_time, False)
time_string = from_index_to_span(words, target_indexblock, locate_itemindex_general(flags, target_indexblock, time_regx))
print(f'[method:gearup]----单主语[单值特征]对应具体时间为: {time_string}')
if type(time_string) == list and len(time_string) == 2:
print('[method:gearup]-----该句为变动2类的补充句式')
quadraple_dict={}
quadraple_dict['item'] = quadraples[-1]['item'].split(' ')[0] + ' 的 ' + subjectname
quadraple_dict['time'] = quadraples[-1]['time']
quadraple_dict['value'] = value_block
quadraples.append(quadraple_dict)
else:
try:
if len(quadraples) != 0:
is_first = True
for qi in range(len(quadraples)+1): # 逆向遍历
if is_first:
each_previous_subjectname =subjectname
is_first=False
else:
each_previous_subjectname = quadraples[::-1][qi-1]['item']
while each_previous_subjectname.count(time_string) != 0 and each_previous_subjectname.index(time_string) > 0 or is_nonetime(time_string):
print(f'[method:gearup]-----{time_string}在主语中需跳过')
index_block = index_block - 1
time_regx, target_indexblock = find_time_regx(values, flags_plain, index_block, pat_uncover_time, False)
time_string = from_index_to_span(words, target_indexblock, locate_itemindex_general(flags, target_indexblock, time_regx))
if time_string == None:
return
print(f'[method:gearup]-----单主语[单值特征]再次确认的对应具体时间为: {time_string}')
else:
while subjectname.count(time_string) != 0 and subjectname.index(time_string) > 0:
print(f'[method:gearup]-----{time_string}在主语中需跳过')
index_block = index_block - 1
time_regx, target_indexblock = find_time_regx(values, flags_plain, index_block, pat_uncover_time, False)
time_string = from_index_to_span(words, target_indexblock, locate_itemindex_general(flags, target_indexblock, time_regx))
if time_string == None:
return
print(f'[method:gearup]-----单主语[单值特征]再次确认的对应具体时间为: {time_string}')
except:
print(f'[method:gearup]-----找不到具体时间并引发下标超限出错,判定为模糊时间再次查找')
time_regx, target_indexblock = find_time_regx(values, flags_plain, index_block, pat_cover_time, True)
time_string = from_index_to_span(words, target_indexblock, locate_itemindex_general(flags, target_indexblock, time_regx))
if time_string == None:
return
print(f'[method:gearup]-----单主语重定位的模糊时间为: {time_string}')
if time_string in subjectname and subjectname.index(time_string) == 0:
subjectname = subjectname.replace(time_string, '')
subjectname = clean_subject(subjectname)
quadraple_dict={}
quadraple_dict['item'] = subjectname
quadraple_dict['time'] = time_string
quadraple_dict['value'] = value_block
quadraples.append(quadraple_dict)
else:#情况2
print('[method:gearup]----情况2')
time_regx, target_indexblock = find_time_regx(values, flags_plain, index_block, pat_cover_time, True)
time_string = from_index_to_span(words, target_indexblock, locate_itemindex_general(flags, target_indexblock, time_regx))
if time_string == None:
return
if type(time_string) == str and type(value_block) == list:
print(f'[method:gearup]-----单主语多值域对应单模糊时间为: {time_string}')
if time_string == '月末' or time_string == '年末':
print(f'[method:gearup]----模糊识别有误,转为识别具体时间,默认为多时间')
time_regx, target_indexblock = find_time_regx(values, flags_plain, index_block, pat_uncover_time, False)
time_string = from_index_to_span(words, target_indexblock, locate_itemindex_general(flags, target_indexblock, time_regx))
while is_nonetime(time_string) and target_indexblock >= 0:
print(f'[method:gearup]----跳过具体时间{time_string}(主语成分)')
time_regx, target_indexblock = find_time_regx(values, flags_plain, target_indexblock-1, pat_uncover_time, False)
time_string = from_index_to_span(words, target_indexblock, locate_itemindex_general(flags, target_indexblock, time_regx))
print(f'[method:gearup]----多时间确认为{time_string}')
if type(time_string) == list and len(time_string) != len(value_block):
time_string = ' - '.join(time_string) # 先当作模糊时间黏合在一起,因为不能按和值域相同的个数对应
if type(time_string) == list and len(time_string) == len(value_block):
for vi in range(len(value_block)):
quadraple_dict={}
if has_formula:
quadraple_dict['item'] = subjectname + '/' + related_subjectname
else:
quadraple_dict['item'] = subjectname
quadraple_dict['time'] = time_string[vi]
quadraple_dict['value'] = value_block[vi]
quadraples.append(quadraple_dict)
# 这里结束了
if type(time_string) == str:
if time_string in subjectname:
subjectname = subjectname.replace(time_string, '')
subjectname = clean_subject(subjectname)
# 暂时不转译模糊时间,原因:赶紧写个大概总纲,细节以后丰富
for vi in range(len(value_block)):
quadraple_dict={}
if has_formula:
quadraple_dict['item'] = subjectname + '/' + related_subjectname
else:
quadraple_dict['item'] = subjectname
quadraple_dict['time'] = time_string + str(vi) +'<待解析>'
quadraple_dict['value'] = value_block[vi]
quadraples.append(quadraple_dict)
else:
if type(time_string) == list and type(value_block) == list:
print(f'[method:gearup]-----单主语多值域对应多具体时间为: {time_string}')
if time_string[-1] in subjectname:
subjectname = subjectname[subjectname.index(time_string[-1])+len(time_string[-1]):]
subjectname = clean_subject(subjectname)
for vi in range(len(value_block)):
quadraple_dict={}
if has_formula:
quadraple_dict['item'] = subjectname + '/' + related_subjectname
else:
quadraple_dict['item'] = subjectname
quadraple_dict['time'] = time_string[vi]
quadraple_dict['value'] = value_block[vi]
quadraples.append(quadraple_dict)
elif type(time_string) == str and type(value_block) == str:
print(f'[method:gearup]-----单主语单值域对应单个具体时间为: {time_string}')
# 逻辑误区:这里按条件来说是具体时间,但是我默认取得时间是按cover来的,所以一旦击中模糊时间而不是在find_time_regx内反转就是逻辑错误
# 所以我这里必须要强行转化一次
if time_string != '本期': # 本期是一种模糊的具体
time_regx, target_indexblock = find_time_regx(values, flags_plain, index_block, pat_uncover_time, False)
time_string = from_index_to_span(words, target_indexblock, locate_itemindex_general(flags, target_indexblock, time_regx))
if time_string == None:
return
# 这里我再次假设出来的时间是单时间
print(f'[method:gearup]-----单主语单值域对应单个具体时间再次确认为: {time_string}')
while is_nonetime(time_string) and target_indexblock >= 0:
print(f'[method:gearup]----跳过具体时间{time_string}(主语成分)')
time_regx, target_indexblock = find_time_regx(values, flags_plain, target_indexblock-1, pat_uncover_time, False)
time_string = from_index_to_span(words, target_indexblock, locate_itemindex_general(flags, target_indexblock, time_regx))
if time_string in subjectname:
subjectname = subjectname[subjectname.index(time_string)+len(time_string):]
subjectname = clean_subject(subjectname)
quadraple_dict={}
if has_formula:
quadraple_dict['item'] = subjectname + '/' + related_subjectname
else:
quadraple_dict['item'] = subjectname
quadraple_dict['time'] = time_string
quadraple_dict['value'] = value_block
quadraples.append(quadraple_dict)
print('[method:gearup->end]')
# 按顺序放入规则列表
pat_repository = [pat_multi_value, pat_multi_rate, pat_change_value1, pat_change_value2, pat_change_rate, pat_take_percentage, pat_single_value, pat_multi_subject_value]
# 由值出发,为值匹配公式或者(科目,时间)
# 找值
def dragout(flags, words, sentence):
quadraples=[]
# flags, words, sentence 是 sep_flag_pre的结果
flags_plain = flags.replace('|','')
print(f'[method:dragout->start]--words:{words}')
print(f'[method:dragout->start]--flags:{flags}')
print(f'[method:dragout->start]--flags_plain:{flags_plain}')
print(f'[method:dragout->start]--sentence:{sentence}')
pattern_matrix = np.mat([re_extractor(pat, flags_plain, ' ') for pat in pat_repository])
# 规则组装器
values = re_extractor(pat_value_word, sentence, ',', False)
if not any(values):
print(f'[method:dragout->end]--该句无值域')
return
print(f'[method:dragout]--全句值域:{values}')
for index_block, value_block in enumerate(values):
# 分子句值集合
if value_block != None:
try:
# 寻找对应模式
match_patterns = [(index_pat, mp) for index_pat, mp in enumerate(pattern_matrix[:,index_block].transpose().getA()[0].tolist()) if mp != None]
match_patterns_dict = dict(match_patterns)
match_pattern_index_set = set(match_patterns_dict.keys()) # 模式的判断依据
print(f'[method:dragout]---match_pattern_index_set: {match_pattern_index_set}')
if match_pattern_index_set=={1,5,6,7}:# 规则一:“占比特征”与“多比例特征”在同一子句中,被认定为同一组合
print(f'[method:dragout]----子句:{sentence.split(",")[index_block]} 击中正则 pat_multi_rate(多比例) -> {match_patterns_dict[1]} 和 pat_take_percentage(占比) -> {match_patterns_dict[5]}')
print(f'[method:dragout]----占比公式值为: {value_block}')
# 得出具体情况下的占比特征的flags
perctg_regx = match_patterns_dict[5]
# 确认分母
take_percentage_start_index, take_percentage_end_index = locate_itemindex_in_take_percentage(flags, index_block, perctg_regx)
denominator = from_index_to_span(words, index_block, (take_percentage_start_index, take_percentage_end_index))
print(f'[method:dragout]----占比公式分母为: {denominator}')
# 确认分子/主语 可能跳句
target_block, numerator_index = locate_numeratorindex_in_take_percentage(words, flags, index_block, take_percentage_start_index)
numerator = from_index_to_span(words, target_block, numerator_index)
print(f'[method:dragout]----占比公式分子为: {numerator}')
gearup(flags, words, values, flags_plain, quadraples, index_block, value_block, numerator, True, denominator)
if match_pattern_index_set=={1,5,6} or match_pattern_index_set=={5,6}:
if '占' in sentence.split(",")[index_block]:# 再次确认
print(f'[method:dragout]----子句:{sentence.split(",")[index_block]} 击中正则 pat_take_percentage(占比) -> {match_patterns_dict[5]}')
print(f'[method:dragout]----占比公式值为: {value_block}')
# 得出具体情况下的占比特征的flags
perctg_regx = match_patterns_dict[5]
# 确认分母
take_percentage_start_index, take_percentage_end_index = locate_itemindex_in_take_percentage(flags, index_block, perctg_regx)
denominator = from_index_to_span(words, index_block, (take_percentage_start_index, take_percentage_end_index))
print(f'[method:dragout]----占比公式分母为: {denominator}')
# 确认分子/主语 可能跳句
target_block, numerator_index = locate_numeratorindex_in_take_percentage(words, flags, index_block, take_percentage_start_index)
numerator = from_index_to_span(words, target_block, numerator_index)
print(f'[method:dragout]----占比公式分子为: {numerator}')
gearup(flags, words, values, flags_plain, quadraples, index_block, value_block, numerator, True, denominator)
else:# 误判
print(f'[method:dragout]----误判为占比类型,流转为单值赋值模式')
match_pattern_index_set={6}
if match_pattern_index_set=={0,6,7} or match_pattern_index_set=={0,6}: # 规则二:“多值特征”
print(f'[method:dragout]----子句:{sentence.split(",")[index_block]} 击中正则 pat_multi_value(多值) -> {match_patterns_dict[0]}')
print(f'[method:dragout]----并列多值为: {value_block}')
# 确认主语
target_block, (name_start_index, name_end_index) = locate_subjectindex_general(words, flags, index_block)
name = from_index_to_span(words, target_block, (name_start_index, name_end_index))
print(f'[method:dragout]----分句主语为: {name}')
if name.count('、') != 0:
print(f'[method:dragout]-----主语含有、号又且被多值特征击中,说明为多主语并列')
names = name.split('、')
if '以及' in names[-1]:
lat = names[-1].split('以及',1)
elif '及' in names[-1]:
lat = names[-1].split('及',1)
elif '和' in names[-1]:
lat = names[-1].split('和',1)
names.pop(-1)
names.extend(lat)
gearup(flags, words, values, flags_plain, quadraples, index_block, value_block, names, False)
else:
gearup(flags, words, values, flags_plain, quadraples, index_block, value_block, name, False)
if match_pattern_index_set=={1, 6}: # 规则三:“多比例特征”
print(f'[method:dragout]----子句:{sentence.split(",")[index_block]} 击中正则 pat_multi_value(多比例) -> {match_patterns_dict[1]}')
print(f'[method:dragout]----并列多百分比为: {value_block}')
target_block, (name_start_index, name_end_index) = locate_subjectindex_general(words, flags, index_block)
name = from_index_to_span(words, target_block, (name_start_index, name_end_index))
print(f'[method:dragout]----分句主语为: {name}')
gearup(flags, words, values, flags_plain, quadraples, index_block, value_block, name, False)
if match_pattern_index_set=={2, 6} or match_pattern_index_set=={4, 6}: # 规则四: “值/比例变动特征” # 可能会在后续的子句中有变动的百分比
print(f'[method:dragout]----子句:{sentence.split(",")[index_block]} 击中正则 pat_change_value1/rate(值变动1/比例变动) -> {match_patterns_dict}')
print(f'[method:dragout]----变动值为: {value_block}')
target_block, (name_start_index, name_end_index) = locate_subjectindex_general(words, flags, index_block)
name = from_index_to_span(words, target_block, (name_start_index, name_end_index))
print(f'[method:dragout]----分句主语为: {name}')
gearup(flags, words, values, flags_plain, quadraples, index_block, value_block, name, True, name)
if match_pattern_index_set=={3}: # 规则五:“值变动+值赋值(变动类的第二种)”
print(f'[method:dragout]----子句:{sentence.split(",")[index_block]} 击中正则 pat_change_value2(值变动2) -> {match_patterns_dict[3]}')
print(f'[method:dragout]----变动值为: {value_block}')
target_block, (name_start_index, name_end_index) = locate_subjectindex_general(words, flags, index_block)
name = from_index_to_span(words, target_block, (name_start_index, name_end_index))
print(f'[method:dragout]----分句主语为: {name}')
gearup(flags, words, values, flags_plain, quadraples, index_block, value_block, name, True, name)