From a9ce9314a2c4bf7b8449b00e8ba25522a6342eba Mon Sep 17 00:00:00 2001 From: Suresh Hewapathirana Date: Fri, 9 Apr 2021 11:22:55 +0100 Subject: [PATCH] schema file from resource folder --- pom.xml | 9 +- .../validators/MzIdValidator.java | 9 +- src/main/resources/mzIdentML1.1.0.xsd | 1845 +++++++++++++++++ 3 files changed, 1856 insertions(+), 7 deletions(-) create mode 100644 src/main/resources/mzIdentML1.1.0.xsd diff --git a/pom.xml b/pom.xml index 9710738..148e77e 100644 --- a/pom.xml +++ b/pom.xml @@ -12,7 +12,6 @@ UTF-8 1.0.2 2.0.28 - 2.0.31-SNAPSHOT 1.2.0 2.3.1 1.3.15 @@ -34,10 +33,6 @@ org.slf4j slf4j-api - - - - ch.qos.logback logback-core @@ -180,6 +175,10 @@ + + org.apache.maven.plugins + maven-resources-plugin + org.apache.maven.plugins maven-jar-plugin diff --git a/src/main/java/uk/ac/ebi/pride/toolsuite/px_validator/validators/MzIdValidator.java b/src/main/java/uk/ac/ebi/pride/toolsuite/px_validator/validators/MzIdValidator.java index cc8c431..f099ede 100644 --- a/src/main/java/uk/ac/ebi/pride/toolsuite/px_validator/validators/MzIdValidator.java +++ b/src/main/java/uk/ac/ebi/pride/toolsuite/px_validator/validators/MzIdValidator.java @@ -15,6 +15,7 @@ import java.io.*; import java.net.URI; import java.net.URISyntaxException; +import java.net.URL; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -24,7 +25,7 @@ public class MzIdValidator implements Validator{ private File file; private List peakFiles; - private static final String MZID_SCHEMA = "https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/psi-pi/mzIdentML1.1.0.xsd"; +// private static final String MZID_SCHEMA = "https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/psi-pi/mzIdentML1.1.0.xsd"; public static Validator getInstance(CommandLine cmd) throws Exception { return new MzIdValidator(cmd); @@ -86,7 +87,11 @@ private static IReport validateMzidSchema(File mzIdentML) { IReport report = new ResultReport(); try 
(BufferedReader br = new BufferedReader(new FileReader(mzIdentML))) { GenericSchemaValidator genericValidator = new GenericSchemaValidator(); - genericValidator.setSchema(new URI(MzIdValidator.MZID_SCHEMA)); + URL url = MzIdValidator.class.getClassLoader().getResource("mzIdentML1.1.0.xsd"); + if (url == null || url.getPath().length() == 0) { + throw new IllegalStateException("MzIdentML1.1.0.xsd not found!"); + } + genericValidator.setSchema(url.toURI()); ErrorHandlerIface handler = new ValidationErrorHandler(); genericValidator.setErrorHandler(handler); genericValidator.validate(br); diff --git a/src/main/resources/mzIdentML1.1.0.xsd b/src/main/resources/mzIdentML1.1.0.xsd new file mode 100644 index 0000000..1e75a87 --- /dev/null +++ b/src/main/resources/mzIdentML1.1.0.xsdhe list of controlled vocabularies used in the file. + + + + + + + + The software packages used to perform the analyses. + + + + + + + + + The samples analysed can optionally be recorded using CV terms for descriptions. If a composite sample has been analysed, the subsample association can be used to build a hierarchical description. + + + + + + + + The collection of sequences (DBSequence or Peptide) identified and their relationship between each other (PeptideEvidence) to be referenced elsewhere in the results. + + + + + + + + + + The analyses performed to get the results, which map the input and output data sets. Analyses are for example: SpectrumIdentification (resulting in peptides) or ProteinDetection (assemble proteins from peptides). + + + + + + + + + The collection of protocols which include the parameters and settings of the performed analyses. + + + + + + + + + The inputs to the analyses including the databases searched, the spectral data and the source file converted to mzIdentML. + + + + + + + + + + Data sets generated by the analyses, including peptide and protein lists. + + + + + + + + + The collection of input and output data sets of the analyses. 
+ + + + + + + + + + The upper-most hierarchy level of mzIdentML with sub-containers for example describing software, protocols and search results (spectrum identifications or protein detection results). + + + + + + + + + The Provider of the mzIdentML record in terms of the contact and software. + + + + + + + + + + + Any bibliographic references associated with the file + + + + + + The date on which the file was produced. + + + + + The version of the schema this instance document refers to, in the format x.y.z. Changes to z should not prevent instance documents from validating. + + + + + + + + A database for searching mass spectra. Examples include a set of amino acid sequence entries, or annotated spectra libraries. + + + + + + + The database name may be given as a cvParam if it maps exactly to one of the release databases listed in the CV, otherwise a userParam should be used. + + + + + + + The version of the database. + + + + + The date and time the database was released to the public; omit this attribute when the date and time are unknown or not applicable (e.g. custom databases). + + + + + The total number of sequences in the database. + + + + + The number of residues in the database. + + + + + + + + A file from which this mzIdentML instance was created. + + + + + + + Any additional parameters describing the source + file. + + + + + + + + + + The specification of static/variable modifications (e.g. Oxidation of Methionine) that are to be considered in the spectra search. + + + + + + + + Filters applied to the search database. The filter must include at least one of Include and Exclude. If both are used, it is assumed that inclusion is performed first. + + + + + The type of filter e.g. database taxonomy filter, pi filter, mw filter + + + + + All sequences fulfilling the specified criteria are included. + + + + + All sequences fulfilling the specified criteria are excluded. + + + + + + + The specification of filters applied to the database searched.
+ + + + + + + + + The table used to translate codons into nucleic acids e.g. by reference to the NCBI translation table. + + + + + + + The details specifying this translation table are captured as cvParams, e.g. translation table, translation start codons and translation table description (see specification document and mapping file) + + + + + + + + + A specification of how a nucleic acid sequence database was translated for searching. + + + + + + + The frames in which the nucleic acid sequence has been translated as a space separated list + + + + + + The parameters and settings of a SpectrumIdentification analysis. + + + + + + + The type of search performed e.g. PMF, Tag searches, MS-MS + + + + + The search parameters other than the modifications searched. + + + + + + + + + + The threshold(s) applied to determine that a result is significant. If multiple terms are used it is assumed that all conditions are satisfied by the passing results. + + + + + + + + The search algorithm used, given as a reference to the SoftwareCollection section. + + + + + + + + The attribute referencing an identifier within the SpectraData section. + + + + A reference to the SpectraData element which locates the input spectra to an external file. + + + + + + One of the search databases used. + + + + A reference to the database searched. + + + + + + An Analysis which tries to identify peptides in input spectra, referencing the database searched, the input spectra, the output results and the protocol that is run. + + + + + + + One of the spectra data sets used. + + + + + + + A reference to the search protocol used for this SpectrumIdentification. + + + + + A reference to the SpectrumIdentificationList produced by this analysis in the DataCollection section. 
+ + + + + + + + References to CV terms defining the measures about product ions to be reported in SpectrumIdentificationItem + + + + + + + + + + + + Contains the types of measures that will be reported in generic arrays for each SpectrumIdentificationItem e.g. product ion m/z, product ion intensity, product ion m/z error + + + + + + + + Represents the set of all search results from SpectrumIdentification. + + + + + + + + + Scores or output parameters associated with the SpectrumIdentificationList. + + + + + + The number of database sequences searched against. This value should be provided unless a de novo search has been performed. + + + + + + + + The specificity rules of the searched modification including for example the probability of a modification's presence or peptide or protein termini. Standard fixed or variable status should be provided by the attribute fixedMod. + + + + + + + + Specification of a search modification as parameter for a spectra search. Contains the name of the modification, the mass, the specificity and whether it is a static modification. + + + + + + The modification is uniquely identified by references to external CVs such as UNIMOD, see specification document and mapping file for more details. + + + + + + True, if the modification is static (i.e. occurs always). + + + + + The mass delta of the searched modification in Daltons. + + + + + The residue(s) searched with the specified modification. For N or C terminal modifications that can occur on any residue, the . character should be used to specify any, otherwise the list of amino acids should be provided. + + + + + + An array of values for a given type of measure and for a particular ion type, in parallel to the index of ions identified. 
+ + + + The values of this particular measure, corresponding to the index defined in ion type + + + + + A reference to the Measure defined in the FragmentationTable + + + + + + IonType defines the index of fragmentation ions being reported, importing a CV term for the type of ion e.g. b ion. Example: if b3 b7 b8 and b10 have been identified, the index attribute will contain 3 7 8 10, and the corresponding values will be reported in parallel arrays below + + + + + + The type of ion identified. + + + + + + The index of ions identified as integers, following standard notation for a-c, x-z e.g. if b3 b5 and b6 have been identified, the index would store "3 5 6". For internal ions, the index contains pairs defining the start and end point - see specification document for examples. For immonium ions, the index is the position of the identified ion within the peptide sequence - if the peptide contains the same amino acid in multiple positions that cannot be distinguished, all positions should be given. + + + + + The charge of the identified fragmentation ions. + + + + + + The product ions identified in this result. + + + + + + + + Reference to the PeptideEvidence element identified. If a specific sequence can be assigned to multiple proteins and or positions in a protein all possible PeptideEvidence elements should be referenced here. + + + + A reference to the PeptideEvidenceItem element(s). + + + + + + An identification of a single (poly)peptide, resulting from querying an input spectra, along with the set of confidence values for that identification. +PeptideEvidence elements should be given for all mappings of the corresponding Peptide sequence within protein sequences. + + + + + + + + + Scores or attributes associated with the SpectrumIdentificationItem e.g. e-value, p-value, score. + + + + + + The charge state of the identified peptide. + + + + + The mass-to-charge value measured in the experiment in Daltons / charge. 
+ + + + + The theoretical mass-to-charge value calculated for the peptide in Daltons / charge. + + + + + The calculated isoelectric point of the (poly)peptide, with relevant modifications included. Do not supply this value if the PI cannot be calculated properly. + + + + + A reference to the identified (poly)peptide sequence in the Peptide element. + + + + + For an MS/MS result set, this is the rank of the identification quality as scored by the search engine. 1 is the top rank. If multiple identifications have the same top score, they should all be assigned rank =1. For PMF data, the rank attribute may be meaningless and values of rank = 0 should be given. + + + + + Set to true if the producers of the file have deemed that the identification has passed a given threshold or been validated as correct. If no such threshold has been set, value of true should be given for all results. + + + + + A reference should be given to the MassTable used to calculate the sequenceMass only if more than one MassTable has been given. + + + + + A reference should be provided to link the SpectrumIdentificationItem to a Sample if more than one sample has been described in the AnalysisSampleCollection. + + + + + + + + All identifications made from searching one spectrum. For PMF data, all peptide identifications will be listed underneath as SpectrumIdentificationItems. For MS/MS data, there will be ranked SpectrumIdentificationItems corresponding to possible different peptide IDs. + + + + + + + + Scores or parameters associated with the SpectrumIdentificationResult (i.e. the set of SpectrumIdentificationItems derived from one spectrum) e.g. the number of peptide sequences within the parent tolerance for this spectrum. + + + + + + The locally unique id for the spectrum in the spectra data set specified by SpectraData_ref. External guidelines are provided on the use of consistent identifiers for spectra in different external formats. + + + + + A reference to a spectra data set (e.g. 
a spectra file). + + + + + + + + The lists of spectrum identifications that are input to the protein detection process. + + + + A reference to the list of spectrum identifications that were input to the process. + + + + + + An Analysis which assembles a set of peptides (e.g. from a spectra search analysis) to proteins. + + + + + + + + + A reference to the ProteinDetectionList in the DataCollection section. + + + + + A reference to the detection protocol used for this ProteinDetection. + + + + + + + + The parameters and settings of a ProteinDetection process. + + + + + + + The parameters and settings for the protein detection given as CV terms. + + + + + The threshold(s) applied to determine that a result is significant. If multiple terms are used it is assumed that all conditions are satisfied by the passing results. + + + + + + The protein detection software used, given as a reference to the SoftwareCollection section. + + + + + + + + The protein list resulting from a protein detection process. + + + + + + + + Scores or output parameters associated with the whole ProteinDetectionList + + + + + + + + + Reference(s) to the SpectrumIdentificationItem element(s) that support the given PeptideEvidence element. Using these references it is possible to indicate which spectra were actually accepted as evidence for this peptide identification in the given protein. + + + + A reference to the SpectrumIdentificationItem element(s). + + + + + + Peptide evidence on which this ProteinHypothesis is based by reference to a PeptideEvidence element. + + + + + + + A reference to the PeptideEvidence element on which this hypothesis is based. + + + + + + A single result of the ProteinDetection analysis (i.e. a protein). + + + + + + + + Scores or parameters associated with this ProteinDetectionHypothesis e.g. p-value + + + + + + A reference to the corresponding DBSequence entry. 
This is optional and redundant, because the PeptideEvidence elements referenced from here also map to the DBSequence. + + + + + Set to true if the producers of the file have deemed that the ProteinDetectionHypothesis has passed a given threshold or been validated as correct. If no such threshold has been set, value of true should be given for all results. + + + + + + + + A set of logically related results from a protein detection, for example to represent conflicting assignments of peptides to proteins. + + + + + + + + + Scores or parameters associated with the ProteinAmbiguityGroup. + + + + + + + + + A molecule modification specification. If n modifications have been found on a peptide, there should be n instances of Modification. If multiple modifications are provided as cvParams, it is assumed that the modification is ambiguous i.e. one modification or another. A cvParam must be provided with the identification of the modification sourced from a suitable CV e.g. UNIMOD. If the modification is not present in the CV (and this will be checked by the semantic validator within a given tolerance window), there is an “unknown modification” CV term that must be used instead. A neutral loss should be defined as an additional CVParam within Modification. If more complex information should be given about neutral losses (such as presence/absence on particular product ions), this can additionally be encoded within the FragmentationArray. + + + + + CV terms capturing the modification, sourced from an appropriate controlled vocabulary. + + + + + + Location of the modification within the peptide - position in peptide sequence, counted from the N-terminus residue, starting at position 1. Specific modifications to the N-terminus should be given the location 0. Modification to the C-terminus should be given as peptide length + 1. If the modification location is unknown e.g. for PMF data, this attribute should be omitted. 
+ + + + + Specification of the residue (amino acid) on which the modification occurs. If multiple values are given, it is assumed that the exact residue modified is unknown i.e. the modification is to ONE of the residues listed. Multiple residues would usually only be specified for PMF data. + + + + + + Atomic mass delta considering the natural distribution of isotopes in Daltons. + + + + + Atomic mass delta when assuming only the most common isotope of elements in Daltons. + + + + + + One (poly)peptide (a sequence with modifications). The combination of Peptide sequence and modifications must be unique in the file. + + + + + + + The amino acid sequence of the (poly)peptide. If a substitution modification has been found, the original sequence +should be reported. + + + + + + + Additional descriptors of this peptide sequence + + + + + + + + + A modification where one residue is substituted by another (amino acid change). + + + + The original residue before replacement. + + + + + + + + + + The residue that replaced the originalResidue. + + + + + + + + + + Location of the modification within the peptide - position in peptide sequence, counted from the N-terminus residue, starting at position 1. +Specific modifications to the N-terminus should be given the location 0. +Modification to the C-terminus should be given as peptide length + 1. + + + + + Atomic mass delta considering the natural distribution of isotopes in Daltons. This should only be reported if the original amino acid is known i.e. it is not "X" + + + + + Atomic mass delta when assuming only the most common isotope of elements in Daltons. This should only be reported if the original amino acid is known i.e. it is not "X" + + + + + + A data set containing spectra data (consisting of one or more spectra). + + + + + + + + + + + + The software used for performing the analyses. 
+ + + + + + + The contact details of the organisation or person that produced the software + + + + + The name of the analysis software package, sourced from a CV if available. + + + + + Any customizations to the software, such as alternative scoring mechanisms implemented, should be documented here as free text. + + + + + + The version of Software used. + + + + + URI of the analysis software e.g. manufacturer's website + + + + + + + + The details of an individual cleavage enzyme should be provided by giving a regular expression or a CV term if a "standard" enzyme cleavage has been performed. + + + + + + + Regular expression for specifying the enzyme cleavage site. + + + + + The name of the enzyme from a CV. + + + + + + Element formula gained at NTerm. + + + + + + + + + + Element formula gained at CTerm. + + + + + + + + + + Set to true if the enzyme cleaves semi-specifically (i.e. one terminus must cleave according to the rules, the other can cleave at any residue), false if the enzyme cleavage is assumed to be specific to both termini (accepting for any missed cleavages). + + + + + The number of missed cleavage sites allowed by the search. The attribute must be provided if an enzyme has been used. + + + + + Minimal distance for another cleavage (minimum: 1). + + + + + + + + + + + + + The list of enzymes used in experiment + + + + + + + If there are multiple enzymes specified, this attribute is set to true if cleavage with different enzymes is performed independently. + + + + + + + The single letter code for the residue. + + + + + The residue mass in Daltons (not including any fixed modifications). + + + + + + Ambiguous residues e.g. X can be specified by the Code attribute and a set of parameters for example giving the different masses that will be used in the search. + + + + + Parameters for capturing e.g. "alternate single letter codes" + + + + + + The single letter code of the ambiguous residue e.g. X. + + + + + + The masses of residues used in the search. 
+ + + + + + + The specification of a single residue within the mass table. + + + + + + Additional parameters or descriptors for the MassTable. + + + + + + The MS spectrum that the MassTable refers to e.g. "1" for MS1 "2" for MS2 or "1 2" for MS1 or MS2. + + + + + + + + PeptideEvidence links a specific Peptide element to a specific position in a DBSequence. There must only be one PeptideEvidence item per Peptide-to-DBSequence-position. + + + + + + + Additional parameters or descriptors for the PeptideEvidence. + + + + + + A reference to the protein sequence in which the specified peptide has been linked. + + + + + A reference to the identified (poly)peptide sequence in the Peptide element. + + + + + Start position of the peptide inside the protein sequence, where the first amino acid of the protein sequence is position 1. Must be provided unless this is a de novo search. + + + + + The index position of the last amino acid of the peptide inside the protein sequence, where the first amino acid of the protein sequence is position 1. Must be provided unless this is a de novo search. + + + + + Previous flanking residue. If the peptide is N-terminal, pre="-" and not pre="". If for any reason it is unknown (e.g. denovo), pre="?" should be used. + + + + + + + + + + Post flanking residue. If the peptide is C-terminal, post="-" and not post="". If for any reason it is unknown (e.g. denovo), post="?" should be used. + + + + + + + + + + A reference to the translation table used if this is PeptideEvidence derived from nucleic acid sequence + + + + + The translation frame of this sequence if this is PeptideEvidence derived from nucleic acid sequence + + + + + Set to true if the peptide is matched to a decoy sequence. + + + + + + + + The tolerance of the search given as a plus and minus value with units. + + + + + CV terms capturing the tolerance plus and minus values. 
+ + + + + + + The format of the spectrum identifier within the source file + + + + + CV term capturing the type of identifier used. + + + + + + + A database sequence from the specified SearchDatabase (nucleic acid or amino acid). If the sequence is nucleic acid, the source nucleic acid sequence +should be given in the seq attribute rather than a translated sequence. + + + + + + + The actual sequence of amino acids or nucleic acid. + + + + + Additional descriptors for the sequence, such as taxon, description line etc. + + + + + + The length of the sequence as a number of bases or residues. + + + + + The source database of this sequence. + + + + + The unique accession of this sequence. + + + + + + + + A description of the sample analysed by mass spectrometry using CVParams or UserParams. If a composite sample has been analysed, a parent sample should be defined, which references subsamples. This represents any kind of substance used in an experimental workflow, such as whole organisms, cells, DNA, solutions, compounds and experimental substances (gels, arrays etc.). + + + + + + + Contact details for the Material. The association to ContactRole could specify, for example, the creator or provider of the Material. + + + + + + The characteristics of a +Material. + + + + + + + + + References to the individual component samples within a mixed parent sample. + + + + A reference to the child sample. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Data external to the XML instance document. The location of the data file is given in the location attribute. + + + + + + + A URI to access documentation and tools to interpret the external format of the ExternalData instance. For example, XML Schema or static libraries (APIs) to access binary formats. + + + + + + + The location of the data file. + + + + + + + + The format of the ExternalData file, for example "tiff" for image files. 
+ + + + + cvParam capturing file formats + + + + + + + A person's name and contact details. Any additional information such as the address, contact email etc. should be supplied using CV parameters or user parameters. + + + + + + + The organization a person belongs to. + + + + + + The Person's last/family name. + + + + + The Person's first name. + + + + + The Person's middle initial. + + + + + + + + + A reference to the organization this contact belongs to. + + + + + + Organizations are entities like companies, universities, government agencies. Any additional information such as the address, email etc. should be supplied either as CV parameters or as user parameters. + + + + + + + + + + + + The containing organization (the university or business which a lab belongs to, etc.) + + + + A reference to the organization this contact belongs to. + + + + + + A contact is either a person or an organization. + + + + + + + Attributes of this contact such as address, email, telephone etc. + + + + + + + + + The role that a Contact plays in an organization or with respect to the associating class. A Contact may have several Roles within scope, and as such, +associations to ContactRole allow the use of a Contact in a certain manner. Examples +might include a provider, or a data analyst. + + + + + + + When a ContactRole is used, it specifies which Contact the role is associated with. + + + + + + The roles (lab equipment sales, contractor, etc.) the Contact fills. + + + + + + CV term for contact roles, such as software provider. + + + + + + + Represents bibliographic references. + + + + + + The names of the authors of the reference. + + + + + The name of the journal, book etc. + + + + + The publisher of the publication. + + + + + The editor(s) of the reference. + + + + + The year of publication. + + + + + The volume name or number. + + + + + The issue name or number. + + + + + The page numbers. + + + + + The title of the BibliographicReference. 
+ + + + + The DOI of the referenced publication. + + + + + + + + The use of a protocol with the requisite Parameters and ParameterValues. ProtocolApplications can take Material or Data (or both) as input +and produce Material or Data (or both) as output. + + + + + + When the protocol was applied. + + + + + + + + Abstract entity allowing either cvParam or userParam to be referenced in other schemas. + + + + The name of the parameter. + + + + + The user-entered value of the parameter. + + + + + An accession number identifying the unit within the OBO foundry Unit CV. + + + + + The name of the unit. + + + + + If a unit term is referenced, this attribute must refer to the CV 'id' attribute defined in the cvList in this file. + + + + + + A single user-defined parameter. + + + + + + The datatype of the parameter, where appropriate (e.g.: xsd:float). + + + + + + + + A single entry from an ontology or a controlled +vocabulary. + + + + + + A reference to the cv element from which this term originates. + + + + + The accession or ID number of this CV term in the source CV. + + + + + + + + A source controlled vocabulary from which cvParams will be obtained. + + + + The full name of the CV. + + + + + The version of the CV. + + + + + The URI of the source CV. + + + + + The unique identifier of this cv within the document to be referenced by cvParam elements. + + + + + + Other classes in the model can be specified as sub-classes, inheriting from Identifiable. Identifiable gives classes a unique identifier within the scope and a name that need not be unique. + + + + An identifier is an unambiguous string that is unique within the scope (i.e. a document, a set of related documents, or a repository) of its use. + + + + + The potentially ambiguous common identifier, such as a human-readable name for the instance. + + + + + + The complete set of Contacts (people and organisations) for this file. 
+ + + + + + + + + The provider of the document in terms of the Contact and the software that produced the document instance. + + + + + + + The Contact that provided the document instance. + + + + + + The Software that produced the document instance. + + + + + + + + Helper type to allow multiple cvParams or userParams to be given for an element. + + + + + + Helper type to allow either a cvParam or a userParam to be provided for an element. + + + + + + A choice of either a cvParam or userParam. + + + + + + + \ No newline at end of file