% HOW_TO_CONTRIBUTE.tex -- forked from Langenscheiss/bibitnow
\documentclass[
a4paper,
12pt,
]
{article}
\usepackage[english]{babel}
\usepackage[twoside, a4paper, lmargin=3.5cm, rmargin=3.0cm, top=2.5cm, bottom=2.5cm]{geometry}
\usepackage[T1]{fontenc}
\usepackage[ansinew]{inputenc}
\usepackage{lmodern}
\usepackage{caption}
\usepackage{graphicx}
\usepackage{graphics}
\usepackage{wrapfig}
\usepackage{color}
\usepackage{upgreek}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{empheq}
\usepackage{dsfont}
\usepackage{trsym}
\usepackage{listings}
\usepackage{pifont}
\usepackage{ifthen}
\usepackage{calc}
\usepackage{enumerate}
\usepackage{hyperref} \hypersetup{colorlinks=true,linktocpage=true,linkcolor=blue,breaklinks=true,citecolor=blue,urlcolor=blue} %
\usepackage[anythingbreaks]{breakurl} % Links can be split over several lines. Only works with pdflatex compiler, unfortunately :(
\definecolor{light-gray}{gray}{0.95}
\definecolor{light-blue}{rgb}{0.89,0.992,1.0}
\lstdefinelanguage{javascript}{
keywords={typeof, new, true, false, catch, function, return, null, catch, switch, var, if, in, while, do, else, case, break},
% keywordstyle=\color{blue}\bfseries,
ndkeywords={class, export, boolean, throw, implements, import, this},
ndkeywordstyle=\color{darkgray}\bfseries,
identifierstyle=\color{black},
sensitive=false,
comment=[l]{//},
morecomment=[s]{/*}{*/},
commentstyle=\color{purple}\ttfamily,
stringstyle=\color{red}\ttfamily,
morestring=[b]',
morestring=[b]"
}
\begin{document}
% new commands
\newcommand{\plgdir}{\$PLUGINDIR}
\newcommand{\gitdir}{\$GITDIR}
\newcommand{\tmpl}{0\_TEMPLATE.js}
\newcommand{\exmpl}{0\_EXAMPLE.js}
\newcommand{\pathbox}[1]{\begin{center}\colorbox{light-gray}{#1}\end{center}}
\newcommand{\App}[1]{App.~\ref{#1}}
\newcommand{\Sec}[1]{Sec.~\ref{#1}}
\newcommand{\plgname}{\textbf{BibItNow!}}
% end new commands
% title
\title{{\plgname} Site Adjusters -- How to contribute?}
\date{\today}
\author{Langenscheiss}
\maketitle
% end title
\tableofcontents
\section*{Welcome}
Hej.\\
If you have decided to contribute by writing your own site adjusters, thanks a lot! This is really going to help me! So let me return the favor and help you get started with this little step-by-step guide.
\section{Step 1 -- Get full source code}\label{sec_1}
The code in this \href{https://github.com/Langenscheiss/bibitnow}{github repository} only exposes the parts of the plugin for which I currently (you may always inspire me to change this policy) accept external contributions. However, for testing/debugging purposes, or for figuring out how the code works if you wish, you obviously need the full source code. You can \href{https://aqpl.mc2.chalmers.se/PDSU/files/BibItNowMultiBrowser.zip}{download} the latest developer versions for all currently supported browsers from my website.\\
Since site adjusters work browser-independently, you may pick whatever browser-variant you prefer as your own developer version. In the following, we will denote the local plugin root directory, i.e., where \textit{manifest.json} is located, as ``\plgdir''. The corresponding root directory in the github repository will be called ``\gitdir''.
\section{Step 2 -- Enable the desired site}\label{sec_2}
The next step is to add the desired website to the plugin. Open the Adjuster List located in
\pathbox{\plgdir/nameResources/urlSpecificAdjusterList.json},
written in the \href{https://en.wikipedia.org/wiki/JSON}{JSON} format. Add an adjuster entry to this file. The structure is pretty self-explanatory. Specify the \href{https://en.wikipedia.org/wiki/URL}{URL} scheme, and the filenames for the adjuster scripts. For example, if you wish to add the website ``https://www.johndoe.com'', and you wish to link it to script adjuster files ``johndoe.js'' (both for preformatter and prefselector), you add the object
\begin{center}
\lstset{language=javascript,showstringspaces=false,backgroundcolor=\color{light-blue}}
\begin{lstlisting}
{ "scheme": "johndoe" , "top": "com" ,
"prefselector": "johndoe" , "preformatter": "johndoe"}
\end{lstlisting}
\end{center}
to the JSON array. Note that the names of script files are stated \textbf{without the .js extension}. Details about how to specify more complex matching schemes, and how these schemes are then matched with the found URL are described in \App{app_matching}.\par
Once you have added an entry to the adjuster list, linking a URL scheme to preformatter/prefselector script files, you need to make sure these files exist, as the plugin will otherwise crash if the URL scheme is positively matched. If you have specified ``johndoe'' as preformatter, add a copy of ``\tmpl'' from the preformatter directory in the github repository
\pathbox{\gitdir/background/preformatters/}
to
\pathbox{\plgdir/background/preformatters/}
and rename it to ``johndoe.js''. Follow the same procedure for the ``prefselector'' specified in the adjuster list, with ``\tmpl'' now of course taken from
\pathbox{\gitdir/extractors/prefselectors/}
and copied to
\pathbox{\plgdir/extractors/prefselectors/}.
These 3 steps should be enough to let the plugin know that a prefselector and preformatter should be loaded for the given website. If you haven't adjusted the template files, and everything is correct, the global web console should show the message ``This seems to work!'' if you activate the plugin popup while surfing on the given website.
%
\section{Step 3 -- Find optimal prefselectors}\label{sec_3}
Now, the potentially more difficult part starts. You need to study the source of abstract/article pages in order to figure out \href{https://www.w3schools.com/cssref/css_selectors.asp}{CSS selectors} which select the desired citation info to be read by the extractor. The first thing to look for are meta tags. Decent publishing houses put the most relevant citation info into such meta tags. Hence, {\plgname} has a fixed kernel of search queries for most bibliography fields -- henceforth shortened as ``bibfields'' -- and if you are lucky, those queries are already enough to complete a citation (details about the bibfields and their corresponding default search queries are presented in \App{app_bibfields}).
However, often you are not that lucky, and you need to be more inventive in providing custom CSS selectors for search queries. Technically, these search queries are carried out PRIOR to those defined in the fixed kernel, thereby suggesting the name ``preferred selector'' or ``prefselector'' as a short form.\par
Once you think you have found the info in the website \href{https://en.wikipedia.org/wiki/HTML}{HTML} code\footnote{Note that {\plgname} queries code \textbf{after} the page has finished loading, possibly including effects of dynamically loaded scripts on the website's DOM. The static source code might hence not always be representative of the information that is available to the plugin. In other words, be sure to inspect the final DOM in case a CSS selector does not find the desired information.} and the corresponding CSS selector for, e.g., the author(s) of an article, refer to \App{app_prefselector} and to the \href{https://github.com/Langenscheiss/bibitnow/blob/master/extractors/prefselectors/0_EXAMPLE.js}{example file}
\pathbox{\gitdir/extractors/prefselectors/\exmpl}
in order to understand how to precisely link a CSS selector to a certain bibfield, which in this case would be \textit{citation\_authors}.\par
The basic procedure is to provide, as first argument, a CSS selector selecting the desired HTML element, and as second argument, an attribute of the selected element that should be read out and saved for the corresponding bibfield. You can add as many custom queries as you want, and each query can be further specified with several optional arguments next to the CSS selector and the attribute. For example, if you have found the author info in the two non-standard meta tags
\begin{center}
\lstset{language=Html,showstringspaces=false,backgroundcolor=\color{light-blue}}
\begin{lstlisting}
<meta name="bla_author" content="John Doe"></meta>
<meta name="bla_author" content="Jane Doe"></meta>
\end{lstlisting}
\end{center}
which are not recognized by the fixed kernel, you will have to add the property
\begin{center}
\lstset{language=javascript,showstringspaces=false,backgroundcolor=\color{light-blue}}
\begin{lstlisting}
citation_authors : [['meta[name="bla_author"]','content']]
\end{lstlisting}
\end{center}
that is, with CSS selector ``meta[name="bla\_author"]'' and attribute ``content'' (since the ``content'' attribute of the meta tags contains the data) to the JSON object \textit{prefselectorMsg} in the prefselector script file.\par
As explained in \App{app_bibfields_class} for the author bibfield, the extractor will select EVERY ELEMENT that is found using the specified prefselector string, and concatenate everything into a semicolon-separated list, as the main parser expects this format. So, if the above author information was instead given in a single tag such as
\begin{center}
\lstset{language=Html,showstringspaces=false,backgroundcolor=\color{light-blue}}
\begin{lstlisting}
<meta name="bla_author"
content="John Doe and Jane Doe"></meta>
\end{lstlisting}
\end{center}
you will first have to replace the ``and'' by a semicolon. This is one main reason why {\plgname} has preformatters, see \Sec{sec_5}.
The file \href{https://github.com/Langenscheiss/bibitnow/blob/master/extractors/prefselectors/0_EXAMPLE.js}{``\exmpl''} shows more examples of adding prefselectors. As a general rule, try to use CSS selectors which read the info in a robust way. Remember that websites change from time to time.
\section{Step 4 -- Parse a link for dynamic citation export if possible}\label{sec_4}
While not necessarily something for your first shot at writing a site adjuster, remember that a core functionality of {\plgname} is to communicate with the citation export/``Download citation''-button offered on abstract pages of most publishers/databases. In the absolute majority of all cases, these buttons are technically form submission buttons or simple file links, i.e., something that can be called with an \href{https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest}{XHR} to a specifically formatted URL.
Since the main parser is expecting the downloaded citation to be in the \href{https://en.wikipedia.org/wiki/RIS_(file_format)}{RIS citation format}, it is highly recommended -- yet not necessary -- to parse a URL which links to a resource in this RIS format.\par
To determine the URL per citation, {\plgname} provides 2 stages:
\begin{enumerate}
\item In the \textit{prefselectorMsg} object, the \textit{citation\_download} property allows to query a download link from the abstract page.
\item If such a download link is found, this link together with all extracted static data is passed to the second stage -- the \textit{formatCitationLink} function defined in the prefselector script file. Possibly using all static citation data including a citation URL, this function allows to specify the XHR method (\href{https://en.wikipedia.org/wiki/XMLHttpRequest#The_open_method}{GET or POST}) and requires you to return a formatted and finalized request URL. \textbf{Note} that if an invalid URL is returned, or if \textbf{no preformatting script has been found for the website}, the plugin will skip the dynamic citation download request altogether.
If, however, a request is sent, and if it finishes with status code 200 (OK, successful), the response data will be saved as text into the \textit{citation\_download} property of the \textit{metaData} object accessible in preformatting.
\end{enumerate}
There are a number of important rules to obey in using the static citation data, and in parsing the request URL.
\begin{enumerate}
\item \textbf{Rule 1: Never parse any data to anything but text.}
It is sometimes tempting to use \href{https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/eval}{\textit{eval}} on data, or to assign it to the \href{https://developer.mozilla.org/en-US/docs/Web/API/Element/innerHTML}{\textit{innerHTML}} property of a \href{https://www.w3schools.com/jsref/dom_obj_attributes.asp}{DOM node}. However, for security reasons, this will not be tolerated in any code of {\plgname}, including site adjusters. In particular, Mozilla warns against this practice when submitting web extensions.
It is allowed to ``try-catch'' a \href{https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON/parse}{\textit{JSON.parse}} on text data, as the content of the returned JSON object is interpreted as data only.
However, the author generally advises against parsing with anything but string methods if it does not require too much extra code. Reading, e.g., a single property from a JSON string can often be done with a simple regular expression; it does not require you to parse the whole string into a JSON object. Note also that performance is currently not a bottleneck for {\plgname}, so you can always afford to manually parse raw strings with regexp magic.
\item \textbf{Rule 2: Avoid cross-site and mixed-content requests.} Modern browsers prohibit cross-site and mixed-content XHR. In other words, for the XHR to be successful, you need to stay on the same domain as in the active tab, from which you have extracted the static citation data, and you may not switch between ``http'' and ``https''. There is only one exception allowed by {\plgname}: cross-site requests to ``citation-needed.springer''. This exception currently exists because the publisher ``Nature'' is in the process of merging with the publisher ``Springer''.
\end{enumerate}
Refer to the example file \href{https://github.com/Langenscheiss/bibitnow/blob/master/extractors/prefselectors/0_EXAMPLE.js}{``\exmpl''} for a demonstration of how to successfully parse a citation download link.
\section{Step 5 -- The most important step: preformatting}\label{sec_5}
As already mentioned in \Sec{sec_3}, the raw data extracted from the website source is, in many cases, not immediately ready to be understood by the main parser, and sometimes not even available at all (which means you need to hardcode it). As the name suggests, the purpose of the preformatting stage is to preformat the data and to correct all these flaws before the main parser takes over. For more details and ``hands-on'' instructions, refer to the file \href{https://github.com/Langenscheiss/bibitnow/blob/master/background/preformatters/0_EXAMPLE.js}{``\exmpl''} in
\pathbox{\gitdir/background/preformatters/}
In the above mentioned example of two author names provided in one meta tag, but not in a semicolon-separated list, the \textit{preformatData} function in the preformatting script file would have to contain a line similar to
\begin{center}
\lstset{language=javascript,showstringspaces=false,backgroundcolor=\color{light-blue}}
\begin{lstlisting}
metaData["citation_authors"] = metaData["citation_authors"]
.replace(/[\s]+and[\s]+/gi," ; ");
\end{lstlisting}
\end{center}
in order to correct for this mistake. The \href{https://github.com/Langenscheiss/bibitnow/blob/master/background/preformatters/0_EXAMPLE.js}{example file} illustrates more complex modifications, and \App{app_bibfields} explains which bibfield expects precisely which data structure in the main parser. \textbf{Note carefully} that the same \textbf{rules and restrictions to parsing data as stated in \Sec{sec_4}} also apply to the entire preformatting stage.\par
The preformatting stage is divided into 2 functions that are called in the following sequence:
\begin{enumerate}
\item \textbf{The function \textit{preformatRawData}} is only called if the dynamic citation download request yielded a positive response with valid, non-empty response data. The \textit{citation\_download} property of the \textit{metaData} JSON-object passed to this function then contains the raw response text. In the subsequent parser stage, this text is assumed to represent citation data in the RIS-format!
Hence, if this is not already the case at this stage, you need to reformat the data by modifying it in accordance with the restrictions stated in \Sec{sec_4} (see the \href{https://github.com/Langenscheiss/bibitnow/blob/master/background/preformatters/pubmed.js}{site adjuster for PubMed} as an example of how to deal with this situation!).
\item \textbf{The function \textit{preformatData}} is called in any case. If any data from the dynamic citation download could -- after calling \textit{preformatRawData} -- be successfully parsed, it will be accessible as a JSON object linked to in the \textit{citation\_download} property of the \textit{metaData} object. The bibfields in this object are associated with exactly the same properties as in the \textit{metaData} object itself. For example,
\begin{center}
\lstset{language=javascript,showstringspaces=false,backgroundcolor=\color{light-blue}}
\begin{lstlisting}
metaData["citation_title"]
\end{lstlisting}
\end{center}
contains the title of the citation as obtained from the static data, while
\begin{center}
\lstset{language=javascript,showstringspaces=false,backgroundcolor=\color{light-blue}}
\begin{lstlisting}
metaData["citation_download"]["citation_title"]
\end{lstlisting}
\end{center}
contains the title obtained from the dynamic download request in case the latter was successful. \textbf{Importantly}, after the \textit{preformatData} function has returned (it returns void), any non-empty string in the JSON object linked to in the \textit{citation\_download} property will replace the corresponding static data. In other words, if you, for example, want the plugin to prefer the statically obtained citation title, you will have to add code similar to
\begin{center}
\lstset{language=javascript,showstringspaces=false,backgroundcolor=\color{light-blue}}
\begin{lstlisting}
if (metaData["citation_title"] != "") {
let download = metaData["citation_download"];
if (download != null
&& typeof(download) == 'object') {
download["citation_title"] = "";
}
}
\end{lstlisting}
\end{center}
to the \textit{preformatData} function in order to erase the dynamically obtained data. Note that \textit{metaData["citation\_download"]} is merely an (empty) string, rather than an object, if no data was obtained through a dynamic download, so you always need to check for that properly to avoid crashes!
\end{enumerate}
\section{Step 6 -- Debugging and Submission}\label{sec_6}
Once you have written everything, continue testing your site adjuster with various sources on the website of interest. Typically, one overlooks edge cases that need adjustments either in the definition of the preferred selectors, or in the preformatting stage. The more robust your adjuster is, the better.\par
Finally, once everything is ready, use the \href{https://github.com/Langenscheiss/bibitnow}{github repository} to propose a new addition. You do not need to upload a new version of the URL adjuster list. Simply state the URL scheme in the header of your adjuster script files, and the adjuster will be added for the next adjuster upgrade release (which may appear in a different frequency compared to feature updates), given that all criteria for a correct submission are fulfilled.
THANKS!
\appendix
\section{Bibfields}\label{app_bibfields}
In this appendix, we list all bibliography fields (bibfields) accessible via preferred selectors and/or in the preformatting stage. More precisely, subsection \ref{app_bibfields_list} states, for each bibfield, the purpose for the final formatted citation, the format and data type expected or enforced during and at the end of the preformatting stage, and what in the following subsection \ref{app_bibfields_class} is defined as the provided \textit{extraction class}.
The fixed default set of CSS selectors and attributes used to search for bibfield data in the DOM (see \Sec{sec_3}) can be found in the \href{https://github.com/Langenscheiss/bibitnow/blob/master/extractors/prefselectors/0_EXAMPLE.js}{example prefselector file} in the github repository.
\subsection{Extraction class}\label{app_bibfields_class}
Depending on the bibfield and its specific needs and purpose for the final citation, {\plgname} adjusts the precise way in which it uses CSS selectors to extract data from the HTML source. This concerns mainly how much data is extracted (how many search queries are performed), and in which format this data is passed onto the parser/preformatting stage. For example, to obtain \emph{all} author name information, it makes sense to not finish a search query with the first non-empty hit, but to instead search for all elements selected by a particular CSS selector.\par
Altogether, {\plgname} differentiates between bibfields in 4 \textbf{extraction classes}.
\begin{itemize}
\item \textbf{Class 1:} For each bibfield belonging to class 1, the plugin stops the search as initiated by the (default or custom, see \Sec{sec_3}) CSS selectors after the first non-empty string is found. Any bibfield of class 1 supports custom, site-specific CSS selectors defined as described in section \ref{sec_3} and \App{app_prefselector}.
\item \textbf{Class 2a:} For bibfields in class 2a, the plugin searches until the first CSS selector (default or custom) gives a non-empty result, then searches and reads all elements selected by this CSS selector and the associated attribute. A bibfield-dependent maximum number of result strings are then concatenated to one string containing a semicolon-separated list and sent to the preformatting stage/main parser. Just as for class 1, all bibfields which are part of this class support custom CSS selectors, attributes, etc., including the possibility to reset the maximum number of result strings passed on.
\item \textbf{Class 2b:} The only difference to class 2a is that the search does not stop with the first CSS selector/attribute yielding a non-empty result. Instead, all default and all custom selectors are queried, and all results are concatenated to a string containing a semicolon-separated list.
\item \textbf{Class 3:} These are additional bibfields which are accessible in the preformatting stage, but for which the data extraction cannot be guided by preferred selectors.
\end{itemize}
Regardless of its class, the extracted string of any bibfield is passed in a sanitized form to the parser/preformatting stage, where the bibfield data is available through the \textit{metaData} object, see \Sec{sec_5}. As detailed in \App{app_sanitize} and \App{app_prefselector}, custom prefselectors give limited control over this sanitizing process.
\subsection{List of bibfields}\label{app_bibfields_list}
Let us now describe each bibfield. The following list is sorted according to the 4 extraction classes defined in \Sec{app_bibfields_class}. For each bibfield, the \textbf{format} and \textbf{content} both in the raw preformatting (RPF) and preformatting (PF) stage (see Sec.~\ref{sec_5}) are stated.
The format is always the \textbf{expected format before entering the next parsing stage}, which for raw preformatting is the preformatting stage, and for the preformatting stage the final parsing stage. If the information contained in the bibfield, i.e., the data accessible via
\begin{center}
\lstset{language=javascript,showstringspaces=false,backgroundcolor=\color{light-blue}}
\begin{lstlisting}
metaData["#BIBFIELD"]
\end{lstlisting}
\end{center}
, with \#BIBFIELD being one of the following bibfields, does not follow the format restrictions expected for the next stage, the system will try to reformat it to the best of its abilities -- by, e.g., removing forbidden characters -- but parsing may fail in those cases. One main purpose of preformatting is thus to ensure that citation data is available to the main parser in the standard format.
\subsubsection{Class 1}
\begin{itemize}
\item citation\_publisher \begin{itemize} \item \textbf{Format (RPF, PF):} Single string containing any unicode characters which survive sanitizing \item \textbf{Content (RPF, PF):} Publisher of cited work, e.g., ``Wiley'' or ``Elsevier''. \end{itemize}
\item citation\_journal\_title \begin{itemize} \item \textbf{Format (RPF, PF):} Single string containing any unicode characters which survive sanitizing \item \textbf{Content (RPF, PF):} Title of the journal in which the cited work appears, e.g., ``Science'' or ``Nature''. \end{itemize}
\item citation\_journal\_abbrev \begin{itemize} \item \textbf{Format (RPF, PF):} Single string containing any unicode characters which survive sanitizing \item \textbf{Content (RPF, PF):}
Abbreviation (as obtained from the source!) of the title of the journal in which the cited work appears, e.g., ``Nat. Comm.'' for ``Nature Communications''. If available, the plugin prefers the abbreviation from its own abbreviation database.\end{itemize}
\item citation\_volume \begin{itemize} \item \textbf{Format (RPF, PF):} Single string containing any unicode characters which survive sanitizing \item \textbf{Content (RPF, PF):} Volume (number) of cited source. \end{itemize}
\item citation\_issue \begin{itemize} \item \textbf{Format (RPF, PF):} Single string containing any unicode characters which survive sanitizing \item \textbf{Content (RPF, PF):} Issue (number) of cited source. \end{itemize}
\item citation\_date \begin{itemize} \item \textbf{Format (RPF, PF):} Single string containing any unicode characters which survive sanitizing \item \textbf{Content (RPF, PF):} String that can unambiguously be parsed to a date. Preferably, but not necessarily, in YYYY-MM-DD format. \end{itemize}
\item citation\_archive\_id \begin{itemize} \item \textbf{Format (RPF, PF):} Single string containing any unicode characters which survive sanitizing \item \textbf{Content (RPF, PF):} Citation ID in open access archive (such as arXiv). \end{itemize}
\item citation\_abstract \begin{itemize} \item \textbf{Format (RPF, PF):} Single string containing any unicode characters which survive sanitizing \item \textbf{Content (RPF, PF):} Abstract of citation. \end{itemize}
\item citation\_issn \begin{itemize} \item \textbf{Format (RPF, PF):} Single string containing the characters ``0-9'', ``X'', and ``-'' \item \textbf{Content (RPF, PF):} ISSN of series in which cited source is published. \end{itemize}
\item citation\_isbn \begin{itemize} \item \textbf{Format (RPF, PF):} Single string containing the characters ``0-9'', ``X'', and ``-'' \item \textbf{Content (RPF, PF):} ISBN of cited book/thesis. \end{itemize}
\item citation\_firstpage \begin{itemize} \item \textbf{Format (RPF, PF):} Single string containing ASCII characters \item \textbf{Content (RPF, PF):} Indicates first page within the work in which citation is published. \end{itemize}
\item citation\_lastpage \begin{itemize} \item \textbf{Format (RPF, PF):} Single string containing ASCII characters \item \textbf{Content (RPF, PF):} Indicates last page within the work in which citation is published. \end{itemize}
\item citation\_url \begin{itemize} \item \textbf{Format (RPF, PF):} Single string containing URI compatible characters \item \textbf{Content (RPF, PF):} URL of citation source website. By default, this is the URL in the address bar. \end{itemize}
\item citation\_doi \begin{itemize} \item \textbf{Format (RPF, PF):} Single string containing URI compatible characters \item \textbf{Content (RPF, PF):} DOI of citation. \end{itemize}
\item citation\_title \begin{itemize} \item \textbf{Format (RPF, PF):} Single string containing any unicode characters which survive sanitizing \item \textbf{Content (RPF, PF):} Contains the title of cited work, i.e., title of the book, scientific article, thesis, etc. \end{itemize}
\item citation\_type \begin{itemize} \item \textbf{Format (RPF, PF):} Single string containing any unicode characters which survive sanitizing \item \textbf{Content (RPF, PF):} Contains any keyword that could indicate the type of cited work. After the preformatting stage the system will look for keywords in this bibfield in order to determine whether the source is an article, a book, a thesis, or a generic source (websites etc.). \end{itemize}
\end{itemize}
TODO CONTINUE
\section{Static extraction sanitizer}\label{app_sanitize}
TODO
\section{URL Matching}\label{app_matching}
In this appendix, we first describe the format of the \href{https://github.com/Langenscheiss/bibitnow/blob/master/nameResources/urlSpecificAdjusterList.json}{URL-specific adjuster list}, located in
\pathbox{\plgdir/nameResources/urlSpecificAdjusterList.json},
and then explain how the URL schemes specified in this list are used for URL matching.
\subsection{URL List Format}\label{app_matching_list}
The URL list is essentially a single array of JSON objects, each corresponding to a URL scheme that is potentially matched to the URL of the website on which the plugin is launched. Each object contains up to 5 valid properties -- ``scheme'', ``top'', ``path'', ``prefselector'' and ``preformatter'' -- to which either a string or, in some cases, another array of JSON objects can be assigned. An example of such a JSON object that {\plgname} uses in order to recognize and load site adjusters for the ``Science'' Journals is
\begin{center}
\lstset{language=javascript,showstringspaces=false,backgroundcolor=\color{light-blue}}
\begin{lstlisting}
{ "scheme": "(?:|[0-9a-z\\-]+[\\.]+)sciencemag" ,
"top": "org" ,
"prefselector": "science" , "preformatter": "science" },
\end{lstlisting}
\end{center}
Let us describe the properties in detail:
\begin{enumerate}
\item \textbf{Property:} ``scheme'' -- \textbf{Type:} \textit{String} -- \textbf{Required:} yes -- \textbf{Description:} State the domain, without top-level domain, that should be recognized by the URL matching system. As further explained below, the assigned string is embedded into a regular expression, meaning that you may also include properly escaped regular expressions into the scheme. \textbf{Never forget to escape} characters with a particular function in regexp, such as ``.'' and ``/''.
Importantly, escapes require \textbf{two} instead of just one extra backslash: one for escaping the backslash character in the JSON string, and the resulting backslash to escape the following character in the regular expression into which the scheme is embedded.
\textbf{NOTE:} While there is no inbuilt technical restriction, regexp-only schemes which match to any domain will not be recognized as valid contributions, and will \textbf{not be tolerated!}
A scheme must match to a specific domain, but you may allow for multiple subdomains, such as in the above stated ''Science`` example.
\item \textbf{Property:} ''top`` -- \textbf{Type:} \textit{String} or \textit{Array} -- \textbf{Required:} yes -- \textbf{Description:} State the top-level domain(s) that should be recognized by the URL matching system. If a \textit{string} is provided, and if the found top-level domain is NOT matched precisely by the provided string, the whole matching procedure terminates without returning a valid site adjuster. Note, however, that the provided string is again embedded into a regular expression, in order to improve compatibility with websites that exist under many top-level domains.
Regexp-only matching is tolerated if it makes sense, i.e., if the domain exists under many different top-level domains (see Google adjuster as an example).\par
If an array is provided, each element of this array must be another JSON object. This object must have a ''scheme`` property in the format described in point 1, and can furthermore have a ''path``,''prefselector`` and ''preformatter`` property, in the same format as described below in points 3, 4, and 5. In this case, the ''scheme`` string is the string specifying the top-level domain as stated above.
The search for top-level domains either stops at the first array element with a positively matched scheme, or terminates the entire matching procedure without returning a site adjuster if no top-level domain scheme matches.\par
The purpose of providing an array with several such objects is to let the URL matcher choose different prefselectors and preformatters for different top-level domains. One example from the source where this is heavily used is the Amazon store. As the code excerpt below demonstrates, only one prefselector is necessary, but each top-level domain results in choosing a different preformatter that is adjusted to the language used on the webpage of the particular top-level domain.
\begin{center}
\lstset{language=javascript,showstringspaces=false,backgroundcolor=\color{light-blue}}
\begin{lstlisting}
{ "scheme": "amazon" , "prefselector": "amazon" ,
"top": [
{ "scheme": "com" , "preformatter": "amazon-com" },
{ "scheme": "de" , "preformatter": "amazon-de" },
{ "scheme": "it" , "preformatter": "amazon-it" },
{ "scheme": "fr" , "preformatter": "amazon-fr" },
...
]
},
\end{lstlisting}
\end{center}
\item \textbf{Property:} ''path`` -- \textbf{Type:} \textit{String} or \textit{Array} -- \textbf{Required:} no -- \textbf{Description:} State the URL path that should be recognized by the URL matching system. If a \textit{string} is provided, the matching is only positive if the \textbf{beginning} of the found path, i.e., the part following the ''/`` after the top-level domain up to some character, is matched by the provided string. If the provided path string does not match, the entire matching procedure terminates without returning a valid site adjuster.
Again, note that the path scheme may be embedded into a regular expression. The rules for contributing are the same as for the ''scheme`` property.\par
Just as for the ''top`` property, assigning instead an array to ''path`` requires that each element of this array must be another JSON object. This object must have a ''scheme`` property in the format described under point 1, and can furthermore have a ''prefselector`` and ''preformatter`` property, in the same format as described below in points 4 and 5; the ''top`` property is not valid in this case.
The ''scheme`` string is the string specifying the path as explained above. If such an array of paths is assigned, a positive match is only obtained if at least one provided path scheme does match in the way described above. After the first positive match, the search stops. If no scheme matches, the matching procedure terminates without returning a valid site adjuster.\par
The purpose of assigning an array to ''path`` is the same as for ''top``: to choose different prefselectors/preformatters for different paths. An example from the code where this is used is the ''ScienceDirect`` portal, where different adjusters are applied for journal articles and books:
\begin{center}
\lstset{language=javascript,showstringspaces=false,backgroundcolor=\color{light-blue}}
\begin{lstlisting}
{ "scheme": "sciencedirect" , "top": "com" ,
"path": [
{ "scheme": "science\\/book" ,
"prefselector": "sciencedirect-book" ,
"preformatter": "sciencedirect-book" },
{ "scheme": "science\\/article" ,
"prefselector": "sciencedirect" ,
"preformatter": "sciencedirect" }
]
},
\end{lstlisting}
\end{center}
\item \textbf{Property:} ''prefselector`` -- \textbf{Type:} \textit{String} -- \textbf{Required:} no -- \textbf{Description:} State the name of the prefselector javascript file, relative to
\pathbox{\plgdir/extractors/prefselectors/}
and without the ''.js`` file extension.
\item \textbf{Property:} ''preformatter`` -- \textbf{Type:} \textit{String} -- \textbf{Required:} no -- \textbf{Description:} State the name of the preformatter javascript file, relative to
\pathbox{\plgdir/background/preformatters/}
and without the ''.js`` file extension.
\end{enumerate}
\subsection{Prefselector/Preformatter preference}
If any JSON object in the list specifies multiple prefselector/preformatter files in different properties, the final one is chosen according to the following preference list (1 = highest, 3 = lowest preference):
\begin{enumerate}
\item ''prefselector``/''preformatter`` property of an object in an array assigned to the ''path`` property. This ''path`` property can itself be either from the base object in the adjuster list, or in an object inside an array assigned to the ''top`` property.
\item ''prefselector``/''preformatter`` property of an object in an array assigned to the ''top`` property.
\item ''prefselector``/''preformatter`` property of the base object in the adjuster list.
\end{enumerate}
\subsection{Matching procedure}
The URL is matched as follows. Prior to the matching, the protocol scheme ''http://`` or ''https://`` (actually called ''scheme``, but here to be distinguished from the above explained ''scheme`` property!) and any ''www`` subdomain following right after the protocol scheme are removed from the obtained URL. The remaining part of the URL is separated into a domain, top-level domain, and a path. The domain contains all characters from the beginning up to (but not including) the last period separating the domain from the top-level domain.
The top-level domain contains all characters after this period up to the first ''/`` separating the path from the domain, or up to the end of the URL if no path is included. Every character after the first ''/`` belongs to the path.\par
The domain-scheme specified in the adjuster list must match \textbf{the entire domain} obtained as described in the previous paragraph. To match multiple subdomains, you may use regular expressions, see previous \Sec{app_matching_list}. The same as for the domain also holds for the top-level domain. For the path, the scheme obtained from the adjuster list is matched from the beginning of the path. A positive match hence only requires that the obtained path \textbf{begins} with the specified path-scheme.
\section{Format of prefselectors}\label{app_prefselector}
As pointed out in section 3, the first crucial ingredient of a site adjuster is its set of preferred selectors, or prefselectors. They are entered as properties of the JSON object ''prefselectorMsg`` defined in the prefselector file:
\begin{center}
\lstset{language=javascript,showstringspaces=false,backgroundcolor=\color{light-blue}}
\begin{lstlisting}
var prefselectorMsg = {
...
#BIBFIELD : [#PREFSELECTOR_1, #PREFSELECTOR_2,...],
...
}
\end{lstlisting}
\end{center}
where \#BIBFIELD is one of the bibliography fields described in \App{app_bibfields}, and each preferred selector \#PREFSELECTOR$_i$ itself is an array of several required and several optional arguments:
\begin{center}
\lstset{language=javascript,showstringspaces=false,backgroundcolor=\color{light-blue}}
\begin{lstlisting}
[
#CSS_SELECTORS , #ATTRIBUTE_TO_READ ,
#ALLOW_MULTIPLE_LINES , #MAXIMUM_NUMBER_OF_CHARS ,
#ALLOW_HTML_TAGS , #MAXIMUM_NUMBER_OF_HITS
]
\end{lstlisting}
\end{center}
The following example of a full, valid ''prefselectorMsg`` is taken from the ''APS Journals`` site adjuster (State: February 2018):
\begin{center}
\lstset{language=javascript,showstringspaces=false,backgroundcolor=\color{light-blue}}
\begin{lstlisting}
var prefselectorMsg = {
citation_issn: [ ['p.legal','innerText'] ],
citation_download: [ ['a#export-article-link','href'] ,
['BINURL','']
],
citation_abstract: [
['div#article-content section.abstract div.content p',
'innerText', true, 20000] ,
['meta[name="description"]','content', true, 20000]
],
citation_author: [
['meta[name="citation_author"]','content'] ,
['div#title h5.authors','innerText']
],
citation_keywords: [
['div.physh-tagging a.physh-concept','innerText']
]
};
\end{lstlisting}
\end{center}
\subsection{Prefselector arguments}
Let us describe the arguments in detail.
\begin{enumerate}
\item \textbf{Argument:} \#CSS\_SELECTORS -- \textbf{Type:} \textit{String} -- \textbf{Required:} yes -- \textbf{Description:} String specifying the \href{https://www.w3schools.com/cssref/css_selectors.asp}{CSS selector(s)} used to select the HTML tag(s) from which the data for the \#BIBFIELD should be obtained. For \#BIBFIELDS from class 1 defined in \App{app_bibfields}, only the first positive, i.e., non-empty result for the \#ATTRIBUTE\_TO\_READ obtained by ANY of the specified selectors will be selected. In class 2, all non-empty results from all selectors will be taken into account. Examples are
\begin{center}
\lstset{language=javascript,showstringspaces=false,backgroundcolor=\color{light-blue}}
\begin{lstlisting}
'meta[name="citation_author"]'
'a.citLink'
'div#title span.titleText'
\end{lstlisting}
\end{center}
\item \textbf{Argument:} \#ATTRIBUTE\_TO\_READ -- \textbf{Type:} \textit{String} -- \textbf{Required:} yes -- \textbf{Description:} Attribute of the selected HTML tag(s) that should be read as the data for the \#BIBFIELD. This attribute could be any standard or non-standard attribute of the selected HTML tag(s). Furthermore, you may set \#ATTRIBUTE\_TO\_READ to either ``innerText'' or ``textContent'' in order to read the content \textbf{IN BETWEEN} the tags,
\begin{center}
\lstset{language=Html,showstringspaces=false,backgroundcolor=\color{light-blue}}
\begin{lstlisting}
<tag>#content</tag>
\end{lstlisting}
\end{center}
, via the \textbf{``innerText''} or \textbf{``textContent''} property.
The difference between the two properties is explained, e.g., \href{https://stackoverflow.com/questions/35213147/difference-between-text-content-vs-inner-text}{at Stack Overflow}.
You can ignore all the warnings about ``innerText'' not being standard, too slow, or not being supported. It is supported by all browsers for which {\plgname} is developed, works more than fast enough for the desired performance of the plugin (i.e., there is no loop over millions of calls of innerText), and has its legitimate purpose. In fact, most site adjusters of {\plgname} use ''innerText``, as it comes closer to what the publisher intends to show on its website. Examples are
\begin{center}
\lstset{language=javascript,showstringspaces=false,backgroundcolor=\color{light-blue}}
\begin{lstlisting}
'content'
'href'
'innerText'
\end{lstlisting}
\end{center}
\item \textbf{Argument:} \#ALLOW\_MULTIPLE\_LINES -- \textbf{Type:} \textit{Boolean} -- \textbf{Required:} no -- \textbf{Default:} \textit{false} -- \textbf{Description:} If \textit{false}, any data string retrieved by setting \#CSS\_SELECTORS and \#ATTRIBUTE\_TO\_READ is trimmed after the first occurrence of a newline character, to avoid passing too much data to the parser! If explicitly set to \textit{true}, any newline character is converted to a simple white space character.
The above given excerpt from the ''APS Journals`` prefselector shows that a typical \#BIBFIELD for which it makes sense to set this argument to \textit{true} is the article's abstract, which may very well contain such newline characters.
\item \textbf{Argument:} \#MAXIMUM\_NUMBER\_OF\_CHARS -- \textbf{Type:} \textit{Positive Integer} -- \textbf{Required:} no -- \textbf{Default:} $1024$ -- \textbf{Description:} Sets the maximum number of characters of the string retrieved by \#CSS\_SELECTORS and \#ATTRIBUTE\_TO\_READ that should be passed to the parser. By default, this number is set to $1024$, in order to avoid passing too much data to the parser.
The above given excerpt from the ''APS Journals`` prefselector shows that a typical \#BIBFIELD for which it makes sense to set a number different from $1024$ is the article's abstract, which may very well contain more characters.
\item \textbf{Argument:} \#ALLOW\_HTML\_TAGS -- \textbf{Type:} \textit{Boolean} -- \textbf{Required:} no -- \textbf{Default:} \textit{false} -- \textbf{Description:} If \textit{false}, any HTML tag found in the data string retrieved by \#CSS\_SELECTORS and \#ATTRIBUTE\_TO\_READ is removed. Note that reading either the ''innerText`` or ''textContent`` property itself always involves an inbuilt HTML tag cleanup, regardless of whether \#ALLOW\_HTML\_TAGS is set to \textit{true} or \textit{false}.
The cleanup performed by {\plgname} is an independent cleanup that is also applied when reading from attributes other than ''innerText`` or ''textContent``. The purpose of this cleanup is to minimize the risk of websites sneaking in script tags into the formatted citation data which could be malicious if they were parsed using ''eval`` or the ''innerHTML`` property.
\item \textbf{Argument:} \#MAXIMUM\_NUMBER\_OF\_HITS -- \textbf{Type:} \textit{Positive Integer} -- \textbf{Required:} no -- \textbf{Default:} depends on \#BIBFIELD, see \App{app_bibfields} -- \textbf{Description:} Sets the maximum number of non-empty readings passed to the parser for any \#BIBFIELD from class 2. For example, the number of found tags with author information is, by default, capped at $10000$.
In general, such upper limits are set to establish a balance between papers having a large set of valid meta information (e.g., hyper authoring with several $1000$ authors) and a protection against (malicious) websites containing too many invisible meta tags.
\end{enumerate}
% \#ALLOW_MULTIPLE_LINES = Boolean, optional. Default = false.
% If false, any data string retrieved by setting \#CSS_SELECTORS and \#ATTRIBUTE_TO_READ is trimmed after the first occurrence of a newline character! If true, any newline character is converted to a simple white space character.
%
% \#MAXIMUM_NUMBER_OF_CHARS = non-negative integer, optional. Default = 1024.
\end{document}