This repository has been archived by the owner on Jul 1, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdoi-wos-authors.py
executable file
·97 lines (87 loc) · 3.12 KB
/
doi-wos-authors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/python
# -*- coding: utf-8 -*-
""" Bot to read authors and DOIs from a Web of Science (WoS/WoK) publication record. """
#
# (C) Federico Leva, 2017
#
# Distributed under the terms of the MIT license.
#
__version__ = '0.2.0'
from lxml import etree
import unicodecsv as csv
import os
# Column titles of the output file; every data row has exactly this many cells.
header = ['Full name', 'First name', 'Last name', 'Entity', 'DOI', 'Title', 'email']


def _first_text(node, path):
    """Return the first result of XPath `path` on `node`, stripped, or ''.

    Only meant for paths selecting text nodes or attributes (string results).
    """
    found = node.xpath(path)
    return found[0].strip() if found else ''


def _int_attr(node, attribute):
    """Return attribute `attribute` of `node` as an int, or 0 if missing/non-numeric."""
    found = node.xpath('./@' + attribute)
    try:
        return int(found[0])
    except (IndexError, ValueError):
        return 0


def _find_addresses(record):
    """Return (addresses element, its declared count), or (None, 0) if absent.

    Addresses can be in one or both of "reprint_addresses" and "addresses";
    the first one found is used.
    """
    found = (record.xpath('//fullrecord_metadata/*[name()="reprint_addresses"]')
             + record.xpath('//fullrecord_metadata/*[name()="addresses"]'))
    if not found:
        return None, 0
    return found[0], _int_attr(found[0], 'count')


def _find_org(addresses, addresses_count):
    """Return the "organizations" element used for all authors, or None."""
    if addresses is None or addresses_count == 0:
        return None
    if addresses_count == 1:
        # All names are from the same address/organization.
        found = addresses.xpath('//organizations')
    else:
        # TODO: match each author to their own address, e.g. with
        # //name[(@dais_id = %d or @daisng_id = %d) and @addr_no > 0]/@addr_no
        # For now every author gets the organization of address number 1.
        found = addresses.xpath('//address_spec[@addr_no = 1]/organizations')
    return found[0] if found else None


def _org_name(org):
    """Return the preferred (pref="Y") or else first organization name, or ''."""
    if org is None:
        return ''
    found = (org.xpath('organization[@pref="Y"]/text()')
             + org.xpath('organization[1]/text()'))
    return found[0] if found else ''


def _author_rows(record, xml):
    """Return one output row (ordered like `header`) per author in `record`.

    `record` is the parsed document, `xml` its file name (used in messages).
    None of the fields is guaranteed to exist; missing ones become ''.
    """
    # NOTE(review): all '//' queries are document-wide, so this presumably
    # expects a single REC per file -- confirm against the exported data.
    addresses, addresses_count = _find_addresses(record)
    org = _find_org(addresses, addresses_count)
    org_name = _org_name(org)
    doi = _first_text(record, '//identifier[@type = "doi" or @type = "xref_doi"]/@value')
    title = _first_text(record, '//title[@type = "item"]/text()')
    # Only complain when several addresses exist but none could be used.
    org_lookup_failed = org is None and addresses_count not in (0, 1)

    rows = []
    # Fetch the names and email addresses from the summary.
    for name in record.xpath('//summary/names//name'):
        if org_lookup_failed:
            # Each name can have one or both of these IDs; report the first.
            if name.xpath('./@dais_id'):
                author_id, author_id_ng = _int_attr(name, 'dais_id'), 0
            else:
                author_id, author_id_ng = 0, _int_attr(name, 'daisng_id')
            print("Could not find org for author %d/%d in %s" % (author_id, author_id_ng, xml))
        rows.append([
            _first_text(name, 'full_name/text()'),
            _first_text(name, 'first_name/text()'),
            _first_text(name, 'last_name/text()'),
            org_name,
            doi,
            title,
            # Always emit the email cell so rows line up with the header.
            _first_text(name, 'email_addr/text()'),
        ])
    return rows


def main():
    """Write authors of every *.xml record in the current directory to doi-authors.csv."""
    # Binary mode: unicodecsv encodes the rows itself, and this also keeps
    # the '\n' line terminator from being translated by the platform.
    with open('doi-authors.csv', 'wb') as doicsv:
        out = csv.writer(doicsv,
                         delimiter='\t',
                         lineterminator='\n',
                         quoting=csv.QUOTE_MINIMAL)
        out.writerow(header)
        # Sorted so that the output order is reproducible across runs.
        for xml in sorted(each for each in os.listdir('.') if each.endswith('.xml')):
            try:
                # Passing the path lets lxml open and close the file itself.
                record = etree.parse(xml)
            except etree.XMLSyntaxError:
                print("%s is not a valid XML file" % xml)
                continue
            if not record.xpath('/records/REC'):
                print(u"Empty record")
                continue
            for author in _author_rows(record, xml):
                out.writerow(author)


if __name__ == '__main__':
    main()