Skip to content

Commit

Permalink
Merge pull request #3521 from IQSS/4.6-tweaks-for-harvesting-from-par…
Browse files Browse the repository at this point in the history
…tners

4.6 tweaks for harvesting from partners
  • Loading branch information
kcondon authored Dec 9, 2016
2 parents bace17e + f684e64 commit 5766a6e
Show file tree
Hide file tree
Showing 6 changed files with 278 additions and 65 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ public class ImportDDIServiceBean {
public static final String NAMING_PROTOCOL_DOI = "doi";
public static final String AGENCY_HANDLE = "handle";
public static final String AGENCY_DOI = "DOI";
public static final String AGENCY_DARA = "dara"; // da|ra - http://www.da-ra.de/en/home/
public static final String REPLICATION_FOR_TYPE = "replicationFor";
public static final String VAR_WEIGHTED = "wgtd";
public static final String VAR_INTERVAL_CONTIN = "contin";
Expand Down Expand Up @@ -91,6 +92,7 @@ public class ImportDDIServiceBean {
public static final String NOTE_SUBJECT_LOCKSS_PERM = "LOCKSS Permission";

public static final String NOTE_TYPE_REPLICATION_FOR = "DVN:REPLICATION_FOR";
private static final String HARVESTED_FILE_STORAGE_PREFIX = "http://";
private XMLInputFactory xmlInputFactory = null;

@EJB CustomFieldServiceBean customFieldService;
Expand Down Expand Up @@ -241,18 +243,28 @@ private void processCodeBook(ImportType importType, XMLStreamReader xmlr, Datase
if (event == XMLStreamConstants.START_ELEMENT) {
if (xmlr.getLocalName().equals("docDscr")) {
processDocDscr(xmlr, datasetDTO);
}
else if (xmlr.getLocalName().equals("stdyDscr")) {
} else if (xmlr.getLocalName().equals("stdyDscr")) {
processStdyDscr(importType, xmlr, datasetDTO);
}
else if (xmlr.getLocalName().equals("fileDscr") && !isMigrationImport(importType)) {
} else if (xmlr.getLocalName().equals("otherMat") && (isNewImport(importType) || isHarvestWithFilesImport(importType)) ) {
processOtherMat(xmlr, datasetDTO, filesMap);
} else if (xmlr.getLocalName().equals("fileDscr") && isHarvestWithFilesImport(importType)) {
// If this is a harvesting import, we'll attempt to extract some minimal
// file-level metadata information from the fileDscr sections as well.
// TODO: add more info here... -- 4.6
processFileDscrMinimal(xmlr, datasetDTO, filesMap);
} else if (xmlr.getLocalName().equals("fileDscr") && isNewImport(importType)) {
// this is a "full" fileDscr section - Dataverses use it
// to encode *tabular* files only. It will contain the information
// about variables, observations, etc. It will be complemented
// by a number of <var> entries in the dataDscr section.
// Dataverses do not use this section for harvesting exports, since
// we don't harvest tabular metadata. And all the "regular"
// file-level metadata is encoded in otherMat sections.
// The goal is to one day be able to import such tabular
// metadata using the direct (non-harvesting) import API.
// EMK TODO: add this back in for ImportType.NEW
//processFileDscr(xmlr, datasetDTO, filesMap);

}
else if (xmlr.getLocalName().equals("otherMat") && (isNewImport(importType) || isHarvestWithFilesImport(importType)) ) {
processOtherMat(xmlr, datasetDTO, filesMap);
}
}

} else if (event == XMLStreamConstants.END_ELEMENT) {
if (xmlr.getLocalName().equals("codeBook")) return;
Expand Down Expand Up @@ -432,12 +444,23 @@ else if (xmlr.getLocalName().equals("relStdy")) {
private void processCitation(ImportType importType, XMLStreamReader xmlr, DatasetDTO datasetDTO) throws XMLStreamException, ImportException {
DatasetVersionDTO dvDTO = datasetDTO.getDatasetVersion();
MetadataBlockDTO citation=datasetDTO.getDatasetVersion().getMetadataBlocks().get("citation");
boolean distStatementProcessed = false;
for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
if (event == XMLStreamConstants.START_ELEMENT) {
if (xmlr.getLocalName().equals("titlStmt")) processTitlStmt(xmlr, datasetDTO);
else if (xmlr.getLocalName().equals("rspStmt")) processRspStmt(xmlr,citation);
else if (xmlr.getLocalName().equals("prodStmt")) processProdStmt(xmlr,citation);
else if (xmlr.getLocalName().equals("distStmt")) processDistStmt(xmlr,citation);
else if (xmlr.getLocalName().equals("distStmt")) {
if (distStatementProcessed) {
// We've already encountered one Distribution Statement in
// this citation, we'll just skip any consecutive ones.
// This is a defensive check against duplicate distStmt
// in some DDIs (notably, from ICPSR)
} else {
processDistStmt(xmlr,citation);
distStatementProcessed = true;
}
}
else if (xmlr.getLocalName().equals("serStmt")) processSerStmt(xmlr,citation);
else if (xmlr.getLocalName().equals("verStmt")) processVerStmt(importType, xmlr,dvDTO);
else if (xmlr.getLocalName().equals("notes")) {
Expand Down Expand Up @@ -882,11 +905,23 @@ private void processAnlyInfo(XMLStreamReader xmlr, MetadataBlockDTO socialScienc

private void processDataColl(XMLStreamReader xmlr, DatasetVersionDTO dvDTO) throws XMLStreamException {
MetadataBlockDTO socialScience =getSocialScience(dvDTO);

String collMode = "";
String timeMeth = "";
String weight = "";

for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
if (event == XMLStreamConstants.START_ELEMENT) {
//timeMethod
if (xmlr.getLocalName().equals("timeMeth")) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("timeMethod", parseText( xmlr, "timeMeth" )));
String thisValue = parseText( xmlr, "timeMeth" );
if (!StringUtil.isEmpty(thisValue)) {
if (!"".equals(timeMeth)) {
timeMeth = timeMeth.concat(", ");
}
timeMeth = timeMeth.concat(thisValue);
}
//socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("timeMethod", parseText( xmlr, "timeMeth" )));
} else if (xmlr.getLocalName().equals("dataCollector")) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("dataCollector", parseText( xmlr, "dataCollector" )));
// frequencyOfDataCollection
Expand All @@ -903,7 +938,14 @@ private void processDataColl(XMLStreamReader xmlr, DatasetVersionDTO dvDTO) thro
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("deviationsFromSampleDesign", parseText( xmlr, "deviat" )));
// collectionMode
} else if (xmlr.getLocalName().equals("collMode")) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("collectionMode", parseText( xmlr, "collMode" )));
String thisValue = parseText( xmlr, "collMode" );
if (!StringUtil.isEmpty(thisValue)) {
if (!"".equals(collMode)) {
collMode = collMode.concat(", ");
}
collMode = collMode.concat(thisValue);
}
//socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("collectionMode", parseText( xmlr, "collMode" )));
//researchInstrument
} else if (xmlr.getLocalName().equals("resInstru")) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("researchInstrument", parseText( xmlr, "resInstru" )));
Expand All @@ -916,12 +958,30 @@ private void processDataColl(XMLStreamReader xmlr, DatasetVersionDTO dvDTO) thro
} else if (xmlr.getLocalName().equals("ConOps")) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("controlOperations", parseText( xmlr, "ConOps" )));
} else if (xmlr.getLocalName().equals("weight")) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("weighting", parseText( xmlr, "weight" )));
String thisValue = parseText( xmlr, "weight" );
if (!StringUtil.isEmpty(thisValue)) {
if (!"".equals(weight)) {
weight = weight.concat(", ");
}
weight = weight.concat(thisValue);
}
//socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("weighting", parseText( xmlr, "weight" )));
} else if (xmlr.getLocalName().equals("cleanOps")) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("cleaningOperations", parseText( xmlr, "cleanOps" )));
}
}
} else if (event == XMLStreamConstants.END_ELEMENT) {
if (xmlr.getLocalName().equals("dataColl")) return;
if (xmlr.getLocalName().equals("dataColl")) {
if (!StringUtil.isEmpty(timeMeth)) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("timeMethod", timeMeth));
}
if (!StringUtil.isEmpty(collMode)) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("collectionMode", collMode));
}
if (!StringUtil.isEmpty(weight)) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("weighting", weight));
}
return;
}
}
}
}
Expand Down Expand Up @@ -1242,6 +1302,16 @@ private void processTitlStmt(XMLStreamReader xmlr, DatasetDTO datasetDTO) throws
parseStudyIdHandle( parseText(xmlr), datasetDTO );
} else if ( AGENCY_DOI.equals( xmlr.getAttributeValue(null, "agency") ) ) {
parseStudyIdDOI( parseText(xmlr), datasetDTO );
} else if ( AGENCY_DARA.equals( xmlr.getAttributeValue(null, "agency"))) {
/*
da|ra - "Registration agency for social and economic data"
(http://www.da-ra.de/en/home/)
ICPSR uses da|ra to register their DOIs; so they have agency="dara"
in their IDNo entries.
Also, their DOIs are formatted differently, without the
hdl: prefix.
*/
parseStudyIdDoiICPSRdara( parseText(xmlr), datasetDTO );
} else {
HashSet<FieldDTO> set = new HashSet<>();
addToSet(set,"otherIdAgency", xmlr.getAttributeValue(null, "agency"));
Expand Down Expand Up @@ -1325,16 +1395,23 @@ private Object parseTextNew(XMLStreamReader xmlr, String endTag) throws XMLStrea
if (event == XMLStreamConstants.CHARACTERS) {
returnString += xmlr.getText().trim().replace('\n',' ');
} else if (event == XMLStreamConstants.START_ELEMENT) {
if (xmlr.getLocalName().equals("p")) {
returnString += "<p>" + parseText(xmlr, "p") + "</p>";
} else if (xmlr.getLocalName().equals("emph")) {
returnString += "<em>" + parseText(xmlr, "emph") + "</em>";
} else if (xmlr.getLocalName().equals("hi")) {
returnString += "<strong>" + parseText(xmlr, "hi") + "</strong>";
if (xmlr.getLocalName().equals("p") || xmlr.getLocalName().equals("br") || xmlr.getLocalName().equals("head")) {
returnString += "<p>" + parseText(xmlr, xmlr.getLocalName()) + "</p>";
} else if (xmlr.getLocalName().equals("emph") || xmlr.getLocalName().equals("em") || xmlr.getLocalName().equals("i")) {
returnString += "<em>" + parseText(xmlr, xmlr.getLocalName()) + "</em>";
} else if (xmlr.getLocalName().equals("hi") || xmlr.getLocalName().equals("b")) {
returnString += "<strong>" + parseText(xmlr, xmlr.getLocalName()) + "</strong>";
} else if (xmlr.getLocalName().equals("ExtLink")) {
String uri = xmlr.getAttributeValue(null, "URI");
String text = parseText(xmlr, "ExtLink").trim();
returnString += "<a href=\"" + uri + "\">" + ( StringUtil.isEmpty(text) ? uri : text) + "</a>";
} else if (xmlr.getLocalName().equals("a") || xmlr.getLocalName().equals("A")) {
String uri = xmlr.getAttributeValue(null, "URI");
if (StringUtil.isEmpty(uri)) {
uri = xmlr.getAttributeValue(null, "HREF");
}
String text = parseText(xmlr, xmlr.getLocalName()).trim();
returnString += "<a href=\"" + uri + "\">" + ( StringUtil.isEmpty(text) ? uri : text) + "</a>";
} else if (xmlr.getLocalName().equals("list")) {
returnString += parseText_list(xmlr);
} else if (xmlr.getLocalName().equals("citation")) {
Expand All @@ -1343,6 +1420,8 @@ private Object parseTextNew(XMLStreamReader xmlr, String endTag) throws XMLStrea
} else {
returnString += parseText_citation(xmlr);
}
} else if (xmlr.getLocalName().equals("txt")) {
returnString += parseText(xmlr);
} else {
throw new EJBException("ERROR occurred in mapDDI (parseText): tag not yet supported: <" + xmlr.getLocalName() + ">" );
}
Expand Down Expand Up @@ -1373,7 +1452,7 @@ private String parseText_list (XMLStreamReader xmlr) throws XMLStreamException {

// check type
String listType = xmlr.getAttributeValue(null, "type");
if ("bulleted".equals(listType) ){
if ("bulleted".equals(listType) || listType == null){
listString = "<ul>\n";
listCloseTag = "</ul>";
} else if ("ordered".equals(listType) ) {
Expand Down Expand Up @@ -1524,6 +1603,31 @@ private void parseStudyIdDOI(String _id, DatasetDTO datasetDTO) throws ImportExc

datasetDTO.setIdentifier(_id.substring(index2+1));
}

/**
 * Parses a da|ra-registered (ICPSR) DOI and stores its parts on the dataset DTO.
 *
 * <p>These identifiers arrive without a protocol prefix, for example
 * {@code 10.3886/ICPSR06635.v1}. Everything before the last {@code '/'} is
 * taken as the authority and everything after it as the identifier. The
 * protocol is always set to {@code "doi"} and the separator to {@code "/"}.
 *
 * @param _id        the raw IDNo text to split
 * @param datasetDTO the DTO that receives authority, protocol, separator and identifier
 * @throws ImportException if {@code _id} is null, contains no {@code '/'},
 *                         ends with {@code '/'} (empty identifier) or has
 *                         nothing before its last {@code '/'} (empty authority)
 */
private void parseStudyIdDoiICPSRdara(String _id, DatasetDTO datasetDTO) throws ImportException{
    // Report a missing value as an import error rather than letting a
    // NullPointerException escape from lastIndexOf() below.
    if (_id == null) {
        throw new ImportException("Error parsing ICPSR/dara DOI IdNo: value is missing");
    }

    int index = _id.lastIndexOf('/');

    if (index == -1) {
        throw new ImportException("Error parsing ICPSR/dara DOI IdNo: "+_id+". '/' not found in string");
    }

    if (index == _id.length() - 1) {
        throw new ImportException("Error parsing ICPSR/dara DOI IdNo: "+_id+" ends with '/'");
    }

    // Mirror of the check above: an empty authority is just as unusable
    // as an empty identifier, so reject it instead of storing "".
    if (index == 0) {
        throw new ImportException("Error parsing ICPSR/dara DOI IdNo: "+_id+" starts with '/'; authority is empty");
    }

    datasetDTO.setAuthority(_id.substring(0, index));
    datasetDTO.setProtocol("doi");
    datasetDTO.setDoiSeparator("/");

    datasetDTO.setIdentifier(_id.substring(index+1));
}
// Helper methods
private MetadataBlockDTO getCitation(DatasetVersionDTO dvDTO) {
return dvDTO.getMetadataBlocks().get("citation");
Expand Down Expand Up @@ -1609,6 +1713,58 @@ private void processOtherMat(XMLStreamReader xmlr, DatasetDTO datasetDTO, Map fi
}
}

// This method attempts to extract the minimal amount of file-level
// metadata from an ICPSR-supplied DDI. (They use "fileDscr" instead of
// "otherMat" for general file metadata; the only field they populate is
// "fileName".) -- 4.6

/**
 * Reads one {@code fileDscr} section and records a minimal file entry for it.
 *
 * <p>A new {@code FileMetadataDTO} (with a {@code DataFileDTO} attached) is
 * appended to the dataset version's file list, which is created first if it
 * is still null. The text of any {@code fileName} element becomes the
 * cleaned-up label. When the closing {@code fileDscr} tag is reached, a
 * default label and storage identifier are filled in if they are still
 * empty, and the method returns.
 *
 * @param xmlr       the stream reader; reading starts with the event after the current one
 * @param datasetDTO the dataset whose version receives the new file entry
 * @param filesMap   not used by this method
 * @throws XMLStreamException if the underlying reader fails
 */
private void processFileDscrMinimal(XMLStreamReader xmlr, DatasetDTO datasetDTO, Map filesMap) throws XMLStreamException {
    FileMetadataDTO fileMetadata = new FileMetadataDTO();

    // Make sure there is a list to register the new entry in.
    if (datasetDTO.getDatasetVersion().getFileMetadatas() == null) {
        datasetDTO.getDatasetVersion().setFileMetadatas(new ArrayList<>());
    }
    datasetDTO.getDatasetVersion().getFileMetadatas().add(fileMetadata);

    DataFileDTO dataFile = new DataFileDTO();
    // reserved ICPSR content type identifier
    dataFile.setContentType("data/various-formats");
    fileMetadata.setDataFile(dataFile);

    int eventType = xmlr.next();
    while (eventType != XMLStreamConstants.END_DOCUMENT) {
        if (eventType == XMLStreamConstants.START_ELEMENT
                && xmlr.getLocalName().equals("fileName")) {
            // The element text is the file name; tidy it before storing.
            fileMetadata.setLabel(tidyHarvestedFileLabel(parseText(xmlr)));
        } else if (eventType == XMLStreamConstants.END_ELEMENT
                && xmlr.getLocalName().equals("fileDscr")) {
            // End of this section: apply the fallbacks and stop.
            if (fileMetadata.getLabel() == null || fileMetadata.getLabel().trim().isEmpty()) {
                fileMetadata.setLabel("harvested file");
            }
            if (StringUtil.isEmpty(fileMetadata.getDataFile().getStorageIdentifier())) {
                fileMetadata.getDataFile().setStorageIdentifier(HARVESTED_FILE_STORAGE_PREFIX);
            }
            return;
        }
        eventType = xmlr.next();
    }
}

/**
 * Cleans a raw file name for use as a label: keeps only the text after the
 * last ':' (or drops every ':' when the last one is the final character),
 * removes the characters # ; < > ? | * and ", turns '/' into '-', and
 * strips leading spaces and tabs.
 */
private static String tidyHarvestedFileLabel(String rawName) {
    String cleaned = rawName;
    int lastColon = cleaned.lastIndexOf(':');
    if (lastColon > -1) {
        boolean colonIsLastChar = lastColon >= cleaned.length() - 1;
        cleaned = colonIsLastChar
                ? cleaned.replaceAll(":", "")
                : cleaned.substring(lastColon + 1);
    }
    return cleaned
            .replaceAll("[#;<>\\?\\|\\*\"]", "")
            .replaceAll("/", "-")
            .replaceFirst("^[ \t]*", "");
}

private void processFileDscr(XMLStreamReader xmlr, DatasetDTO datasetDTO, Map filesMap) throws XMLStreamException {
FileMetadataDTO fmdDTO = new FileMetadataDTO();

Expand Down
Loading

0 comments on commit 5766a6e

Please sign in to comment.