#39 The supplied Input is no longer written into memory unnecessarily

This commit is contained in:
Andreas Penski 2019-12-18 15:57:44 +01:00
parent d7ee019194
commit efd4fd5fff
63 changed files with 1111 additions and 18196 deletions

View file

@ -81,8 +81,8 @@ public class DefaultCheck implements Check {
this.repository = new ScenarioRepository(this.contentRepository);
this.repository.initialize(configuration);
this.checkSteps = new ArrayList<>();
this.checkSteps.add(new CreateDocumentIdentificationAction());
this.checkSteps.add(new DocumentParseAction());
this.checkSteps.add(new CreateDocumentIdentificationAction());
this.checkSteps.add(new ScenarioSelectionAction(this.repository));
this.checkSteps.add(new SchemaValidationAction());
this.checkSteps.add(new SchematronValidationAction(this.contentRepository, this.conversionService));

View file

@ -0,0 +1,57 @@
package de.kosit.validationtool.impl.input;
import java.io.InputStream;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import de.kosit.validationtool.api.Input;
/**
 * Common base for all {@link Input} implementations. It stores the lazily computed hash code and the length of the
 * input and offers {@link #wrap(InputStream)} to decorate a raw stream so that both values are reported back to this
 * instance while the stream is consumed.
 *
 * @author Andreas Penski
 */
@Slf4j
public abstract class AbstractInput implements Input, LazyReadInput {

    @Getter
    @Setter
    private long length;

    private byte[] hashCode;

    /**
     * Tells whether this input can hand out a fresh source more than once. The base implementation answers
     * {@code true}; subclasses override it where that does not hold.
     *
     * @return true when the source can be obtained repeatedly
     */
    public boolean supportsMultipleReads() {
        return true;
    }

    @Override
    public void setHashCode(final byte[] digest) {
        this.hashCode = digest;
    }

    @Override
    public boolean isHashcodeComputed() {
        return this.hashCode != null;
    }

    /**
     * Returns the hash code of this input.
     *
     * @return the stored digest
     * @throws IllegalStateException when no hash code has been set so far
     */
    @Override
    public byte[] getHashCode() {
        if (this.hashCode != null) {
            return this.hashCode;
        }
        throw new IllegalStateException("Hashcode is not computed yet");
    }

    /**
     * Decorates the given stream with the helpers that are still needed: a digesting wrapper as long as no hash code
     * is known and a counting wrapper as long as the length is still 0.
     *
     * @param stream the raw stream
     * @return the decorated stream, or the given stream when nothing is missing
     */
    protected InputStream wrap(final InputStream stream) {
        final InputStream digested = isHashcodeComputed() ? stream
                : StreamHelper.wrapDigesting(this, stream, getDigestAlgorithm());
        return getLength() == 0 ? StreamHelper.wrapCount(this, digested) : digested;
    }
}

View file

@ -0,0 +1,39 @@
package de.kosit.validationtool.impl.input;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import javax.xml.transform.Source;
import javax.xml.transform.stream.StreamSource;
import lombok.AllArgsConstructor;
import lombok.Getter;
/**
 * Classical in-memory {@link de.kosit.validationtool.api.Input} backed by a byte array. Reading the whole file into
 * memory prior to validating is not memory efficient; consider using the {@link ResourceInput} instead.
 *
 * @author Andreas Penski
 */
@Getter
@AllArgsConstructor
public class ByteArrayInput extends AbstractInput {

    private final byte[] content;

    private final String name;

    private final String digestAlgorithm;

    /**
     * Creates a new stream over the content on every call and decorates it via {@link #wrap(InputStream)}.
     *
     * @return a stream based source carrying the name of this input as system id
     */
    @Override
    public Source getSource() {
        final ByteArrayInputStream raw = new ByteArrayInputStream(this.content);
        return new StreamSource(wrap(raw), getName());
    }

    /**
     * The length is derived directly from the array.
     *
     * @return the number of bytes, 0 when there is no content
     */
    @Override
    public long getLength() {
        if (this.content == null) {
            return 0;
        }
        return this.content.length;
    }
}

View file

@ -0,0 +1,36 @@
package de.kosit.validationtool.impl.input;
import java.io.InputStream;
import de.kosit.validationtool.api.Input;
/**
 * Internal callback contract for inputs whose hash code and length only become known while their stream is being
 * read, e.g. for document identification.
 *
 * @see StreamHelper#wrapDigesting(LazyReadInput, InputStream, String) for details
 * @author Andreas Penski
 */
interface LazyReadInput {

    /**
     * Tells whether a hash code is already available.
     *
     * @return true when computed
     */
    boolean isHashcodeComputed();

    /**
     * Stores the hash code.
     *
     * @param digest the digest
     */
    void setHashCode(byte[] digest);

    /**
     * Stores the length of the {@link Input}.
     *
     * @param length the length
     */
    void setLength(long length);
}

View file

@ -0,0 +1,43 @@
package de.kosit.validationtool.impl.input;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import javax.xml.transform.Source;
import javax.xml.transform.stream.StreamSource;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import de.kosit.validationtool.api.Input;
/**
 * An {@link Input} carrying a {@link URL}, which can be used for all 'locatable' inputs such as {@link File},
 * {@link java.nio.file.Path} and any other {@link URL}.
 *
 * The content is NOT read into memory, so this implementation is memory efficient. The validation process MAY read
 * the stream more than once. Make sure that the {@link URL} points to fast I/O devices.
 *
 * @author Andreas Penski
 */
@Getter
@RequiredArgsConstructor
public class ResourceInput extends AbstractInput {

    private final URL url;

    private final String name;

    private final String digestAlgorithm;

    /**
     * Opens a new stream on the URL for every call and decorates it with the shared
     * {@link AbstractInput#wrap(InputStream)} logic, so that hash code and length are collected while the stream is
     * consumed, in the same way as for the other inputs.
     *
     * @return a stream based source carrying the name of this input as system id
     * @throws IOException when the URL can not be opened
     */
    @Override
    public Source getSource() throws IOException {
        final InputStream raw = this.url.openStream();
        try {
            return new StreamSource(wrap(raw), this.name);
        } catch (final RuntimeException e) {
            // wrapping failed (e.g. unknown digest algorithm): do not leak the already opened stream
            try {
                raw.close();
            } catch (final IOException closeError) {
                e.addSuppressed(closeError);
            }
            throw e;
        }
    }
}

View file

@ -0,0 +1,108 @@
package de.kosit.validationtool.impl.input;
import java.io.IOException;
import java.nio.charset.Charset;
import javax.xml.transform.Source;
import javax.xml.transform.stream.StreamSource;
import org.apache.commons.io.input.ReaderInputStream;
import org.apache.commons.lang3.NotImplementedException;
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
/**
 * A validator {@link de.kosit.validationtool.api.Input} based on a {@link Source}. Only {@link StreamSource} is
 * accepted. Such a source is handed out once only, see {@link #supportsMultipleReads()}.
 *
 * @author Andreas Penski
 */
@Getter
@Slf4j
public class SourceInput extends AbstractInput {

    private final Source source;

    private final String name;

    private final String digestAlgorithm;

    /**
     * Creates an input without a precomputed hash code.
     *
     * @param source the source to read from
     * @param name the name of the input
     * @param digestAlgorithm the algorithm used for hashing
     */
    public SourceInput(final StreamSource source, final String name, final String digestAlgorithm) {
        this(source, name, digestAlgorithm, null);
    }

    /**
     * Creates an input with an optional precomputed hash code.
     *
     * @param source the source to read from
     * @param name the name of the input
     * @param digestAlgorithm the algorithm used for hashing
     * @param hashCode an already known hash code, may be null
     * @throws IllegalStateException when the source is not a {@link StreamSource}
     */
    public SourceInput(final Source source, final String name, final String digestAlgorithm, final byte[] hashCode) {
        this.source = source;
        this.name = name;
        this.digestAlgorithm = digestAlgorithm;
        setHashCode(hashCode);
        validate();
    }

    @Override
    public boolean supportsMultipleReads() {
        return false;
    }

    /**
     * Returns the source to read. As long as no hash code is known, the underlying stream or reader is decorated
     * first; otherwise the original source is returned unless it appears to be consumed already.
     *
     * @return the source to read from
     * @throws IllegalStateException when the source type is unsupported or the source has been consumed
     */
    @Override
    public Source getSource() throws IOException {
        if (!isStreamSource()) {
            throw new IllegalStateException("Unsupported source. Only InputStream-based StreamSource supported yet");
        }
        if (!isHashcodeComputed()) {
            return wrap();
        }
        if (isConsumed()) {
            throw new IllegalStateException("A StreamSource can only read once");
        }
        return this.source;
    }

    /** Rejects unsupported sources and warns when a reader has to be wrapped without a known hash code. */
    private void validate() {
        if (!isStreamSource()) {
            throw new IllegalStateException("Unsupported source. Only StreamSource supported yet");
        }
        final StreamSource streamSource = (StreamSource) this.source;
        final boolean withoutByteStream = streamSource.getInputStream() == null;
        if (withoutByteStream && !isHashcodeComputed()) {
            log.warn("No hashcode supplied, will wrap the reader using system default charset");
        }
    }

    private boolean isStreamSource() {
        return this.source instanceof StreamSource;
    }

    /**
     * Checks whether the stream reports nothing available or the reader is not ready. An {@link IOException} during
     * that check is treated as consumed.
     */
    private boolean isConsumed() throws IOException {
        if (!isStreamSource()) {
            throw new NotImplementedException("Supports only StreamSource yet");
        }
        final StreamSource streamSource = (StreamSource) this.source;
        try {
            if (streamSource.getInputStream() != null && streamSource.getInputStream().available() == 0) {
                return true;
            }
            return streamSource.getReader() != null && !streamSource.getReader().ready();
        } catch (final IOException e) {
            return true;
        }
    }

    /**
     * Builds a new source over the decorated byte stream. A reader is converted to bytes with the system default
     * charset. The original source is returned when it carries neither a stream nor a reader.
     */
    private Source wrap() {
        if (!isStreamSource()) {
            return this.source;
        }
        final StreamSource streamSource = (StreamSource) this.source;
        if (streamSource.getInputStream() != null) {
            return new StreamSource(wrap(streamSource.getInputStream()), this.source.getSystemId());
        }
        if (streamSource.getReader() != null) {
            final ReaderInputStream readerBytes = new ReaderInputStream(streamSource.getReader(), Charset.defaultCharset());
            return new StreamSource(wrap(readerBytes), this.source.getSystemId());
        }
        return this.source;
    }
}

View file

@ -0,0 +1,95 @@
package de.kosit.validationtool.impl.input;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import org.apache.commons.io.input.CountingInputStream;
/**
 * Helper for stream handling. Offers stream decorators which report a hash code or the number of bytes back to a
 * {@link LazyReadInput} when the decorated stream is closed.
 *
 * @author Andreas Penski
 */
public class StreamHelper {

    /**
     * Helper class, which generates the hashcode while reading the stream e.g. for parsing the document. This allows
     * generating the hashcode without an additional reading step. The hash is published on the first
     * {@link #close()} and covers the bytes read through this stream.
     */
    private static class DigestingInputStream extends FilterInputStream {

        private final MessageDigest digest;

        private final LazyReadInput reference;

        /**
         * Set once the hash has been handed over. {@link MessageDigest#digest()} resets the digest, so publishing
         * again on a repeated close would replace the real hash with the hash of empty input.
         */
        private boolean published;

        DigestingInputStream(final LazyReadInput input, final InputStream in, final MessageDigest digest) {
            super(new DigestInputStream(in, digest));
            this.digest = digest;
            this.reference = input;
        }

        @Override
        public void close() throws IOException {
            super.close();
            if (!this.published) {
                this.published = true;
                this.reference.setHashCode(this.digest.digest());
            }
        }
    }

    /**
     * Helper class, which counts the bytes passing through and reports the count as length when closed. Reporting on
     * a repeated close is harmless, since the same count is set again.
     */
    private static class CountInputStream extends FilterInputStream {

        private final LazyReadInput reference;

        public CountInputStream(final LazyReadInput input, final InputStream stream) {
            super(new org.apache.commons.io.input.CountingInputStream(stream));
            this.reference = input;
        }

        @Override
        public void close() throws IOException {
            super.close();
            this.reference.setLength(((CountingInputStream) this.in).getByteCount());
        }
    }

    private StreamHelper() {
        // hide
    }

    /**
     * Creates a {@link MessageDigest} for the given algorithm.
     *
     * @param algorithm the name of the digest algorithm
     * @return a new digest instance
     * @throws IllegalArgumentException when the algorithm is not available
     */
    public static MessageDigest createDigest(final String algorithm) {
        try {
            return MessageDigest.getInstance(algorithm);
        } catch (final NoSuchAlgorithmException e) {
            // should not happen
            throw new IllegalArgumentException(String.format("Specified method %s is not available", algorithm), e);
        }
    }

    /**
     * Wraps the {@link InputStream} with a counting length implementation.
     *
     * @param input the {@link LazyReadInput input}
     * @param stream the stream
     * @return a wrapped stream
     */
    public static InputStream wrapCount(final LazyReadInput input, final InputStream stream) {
        return new CountInputStream(input, stream);
    }

    /**
     * Wraps the {@link InputStream} with an implementation that generates a hash sum over the stream data.
     *
     * @param input the {@link LazyReadInput input}
     * @param stream the stream
     * @param digestAlgorithm the name of the digest algorithm to use
     * @return a wrapped stream
     * @throws IllegalArgumentException when the algorithm is not available
     */
    public static InputStream wrapDigesting(final LazyReadInput input, final InputStream stream, final String digestAlgorithm) {
        return new DigestingInputStream(input, stream, createDigest(digestAlgorithm));
    }
}

View file

@ -19,14 +19,10 @@
package de.kosit.validationtool.impl.tasks;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.stream.Collectors;
import javax.xml.transform.stream.StreamSource;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@ -63,10 +59,10 @@ public class DocumentParseAction implements CheckAction {
throw new IllegalArgumentException("Input may not be null");
}
Result<XdmNode, XMLSyntaxError> result;
try ( final InputStream input = new ByteArrayInputStream(content.getContent()) ) {
try {
final DocumentBuilder builder = ObjectFactory.createProcessor().newDocumentBuilder();
builder.setLineNumbering(true);
final XdmNode doc = builder.build(new StreamSource(input));
final XdmNode doc = builder.build(content.getSource());
result = new Result<>(doc, Collections.emptyList());
} catch (final SaxonApiException | IOException e) {
log.debug("Exception while parsing {}", content.getName(), e);

View file

@ -20,67 +20,201 @@
package de.kosit.validationtool.impl.tasks;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import javax.xml.transform.Source;
import javax.xml.transform.stream.StreamSource;
import javax.xml.validation.Validator;
import org.apache.commons.io.FileUtils;
import org.xml.sax.SAXException;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import de.kosit.validationtool.api.Input;
import de.kosit.validationtool.impl.CollectingErrorEventHandler;
import de.kosit.validationtool.impl.ObjectFactory;
import de.kosit.validationtool.impl.input.AbstractInput;
import de.kosit.validationtool.impl.model.Result;
import de.kosit.validationtool.model.reportInput.CreateReportInput;
import de.kosit.validationtool.model.reportInput.ValidationResultsXmlSchema;
import de.kosit.validationtool.model.reportInput.XMLSyntaxError;
import de.kosit.validationtool.model.scenarios.ScenarioType;
import net.sf.saxon.s9api.SaxonApiException;
import net.sf.saxon.s9api.Serializer;
import net.sf.saxon.s9api.XdmNode;
/**
* Schema-Validierung der Eingabe-Datei mittels Schema-Definition aus dem identifizierten Szenario.
* Schema validation of the {@link Input} with the schema of the supplied scenario. This implementation is based on JDK
* functionality and therefore needs a {@link Source} to do the actual validation. Since we base the validator on Saxon
* HE functionality, we have no support for schema in Saxon (e.g. the in memory version of the document is not
* schema-aware) and need to re-read the actual source.
*
* Since the actual {@link Input} implementation might not be readable twice, we must serialize the previously read
* document in that case. This implementation tries to do the validation in an efficient manner. If possible the source
* is read a second time to validate. If not, the source is serialized to the heap upon re-read/validation up to a
* configurable file size. The document is serialized to a temporary file otherwise.
*
* @author Andreas Penski
*/
@Slf4j
public class SchemaValidationAction implements CheckAction {
private Result<Boolean, XMLSyntaxError> validate(byte[] document, ScenarioType scenarioType) {
/**
 * A {@link SerializedDocument} which keeps the serialized document in a byte array on the heap.
 */
private static class ByteArraySerializedDocument implements SerializedDocument {

    private byte[] bytes;

    @Override
    public InputStream openStream() {
        return new ByteArrayInputStream(this.bytes);
    }

    @Override
    public void close() {
        // nothing do do
    }

    /**
     * Serializes the node with a Saxon {@link Serializer} into memory and keeps the resulting bytes.
     *
     * @param node the document to serialize
     */
    @Override
    public void serialize(final XdmNode node) throws SaxonApiException, IOException {
        try ( final ByteArrayOutputStream buffer = new ByteArrayOutputStream() ) {
            final Serializer writer = ObjectFactory.createProcessor().newSerializer();
            writer.setOutputStream(buffer);
            writer.serializeNode(node);
            writer.close();
            this.bytes = buffer.toByteArray();
        }
    }
}
/**
 * A {@link SerializedDocument} which writes the serialized document to a temporary file. The file is created with
 * the instance and deleted on {@link #close()}.
 */
private static class FileSerializedDocument implements SerializedDocument {

    private final Path file;

    FileSerializedDocument() throws IOException {
        this.file = Files.createTempFile("validator", ".xml");
    }

    @Override
    public InputStream openStream() throws IOException {
        return Files.newInputStream(this.file);
    }

    @Override
    public void close() throws IOException {
        Files.deleteIfExists(this.file);
    }

    /**
     * Serializes the node with a Saxon {@link Serializer} into the temporary file.
     *
     * @param node the document to serialize
     */
    @Override
    public void serialize(final XdmNode node) throws SaxonApiException, IOException {
        try ( final OutputStream target = Files.newOutputStream(this.file) ) {
            final Serializer writer = ObjectFactory.createProcessor().newSerializer();
            writer.setOutputStream(target);
            writer.serializeNode(node);
            writer.close();
        }
    }
}
// Default threshold; it is multiplied by FileUtils.ONE_MB below, so the value is a number of megabytes.
private static final Long BA_LIMIT = 10L;
// Name of the system property that overrides the default threshold.
private static final String LIMIT_PARAMETER = "schema.validation.inmem.limit";
// Limit in bytes: the system property value (BA_LIMIT when the property is not set) times FileUtils.ONE_MB.
// serialize() compares the input length against it to choose between heap and temporary file.
@Setter(AccessLevel.PACKAGE)
@Getter
private long inMemoryLimit = Long.parseLong(System.getProperty(LIMIT_PARAMETER, BA_LIMIT.toString())) * FileUtils.ONE_MB;
/**
 * Validates the document against the schema of the given scenario using a JAXP {@link Validator}. Errors are
 * collected by a {@link CollectingErrorEventHandler}.
 *
 * @param results the bag holding the input and the results of the previous steps
 * @param scenarioType the scenario providing the schema
 * @return a result which is true when the handler collected no errors, together with the collected errors
 * @throws IllegalStateException when validation fails with a SAX, Saxon or I/O error
 */
private Result<Boolean, XMLSyntaxError> validate(final Bag results, final ScenarioType scenarioType) {
log.debug("Validating document using scenario {}", scenarioType.getName());
final CollectingErrorEventHandler errorHandler = new CollectingErrorEventHandler();
try ( InputStream input = new ByteArrayInputStream(document) ) {
try ( final SourceProvider validateInput = resolveSource(results) ) {
final Validator validator = ObjectFactory.createValidator(scenarioType.getSchema());
validator.setErrorHandler(errorHandler);
validator.validate(new StreamSource(input));
validator.validate(validateInput.getSource());
return new Result<>(!errorHandler.hasErrors(), errorHandler.getErrors());
} catch (SAXException | IOException e) {
} catch (final SAXException | SaxonApiException | IOException e) {
throw new IllegalStateException("Error validating document", e);
}
}
/**
 * Runs the schema validation for the selected scenario. The outcome is stored in the bag and a
 * {@link ValidationResultsXmlSchema} is attached to the report input. It lists the schema resources of the scenario
 * and, when the validation result is not valid, the collected errors.
 *
 * @param results the bag of the current check
 */
@Override
public void check(Bag results) {
public void check(final Bag results) {
final CreateReportInput report = results.getReportInput();
final ScenarioType scenario = results.getScenarioSelectionResult().getObject();
final Result<Boolean, XMLSyntaxError> validateResult = validate(results.getInput().getContent(), scenario);
final Result<Boolean, XMLSyntaxError> validateResult = validate(results, scenario);
results.setSchemaValidationResult(validateResult);
ValidationResultsXmlSchema result = new ValidationResultsXmlSchema();
final ValidationResultsXmlSchema result = new ValidationResultsXmlSchema();
report.setValidationResultsXmlSchema(result);
result.getResource().addAll(scenario.getValidateWithXmlSchema().getResource());
if (!validateResult.isValid()) {
result.getXmlSyntaxError().addAll(validateResult.getErrors());
}
}
/**
 * Determines where the validator reads the document from. An {@link AbstractInput} that supports multiple reads is
 * asked for its source again; any other input is replaced by a serialized copy of the parsed document.
 *
 * @param results the bag holding the input and the parser result
 * @return a provider for the source to validate
 */
private SourceProvider resolveSource(final Bag results) throws IOException, SaxonApiException {
    final boolean rereadable = results.getInput() instanceof AbstractInput
            && ((AbstractInput) results.getInput()).supportsMultipleReads();
    if (!rereadable) {
        return serialize(results.getInput(), results.getParserResult().getObject());
    }
    return () -> results.getInput().getSource();
}
/**
 * Serializes the parsed document so it can be read again for validation. An {@link AbstractInput} whose length is
 * below the in-memory limit is serialized to the heap, everything else to a temporary file.
 *
 * @param input the original input, used to decide on the target
 * @param object the parsed document to serialize
 * @return the serialized document; the caller is responsible for closing it
 * @throws IOException when writing fails
 * @throws SaxonApiException when serializing fails
 */
private SerializedDocument serialize(final Input input, final XdmNode object) throws IOException, SaxonApiException {
    final SerializedDocument doc;
    if (input instanceof AbstractInput && ((AbstractInput) input).getLength() < getInMemoryLimit()) {
        doc = new ByteArraySerializedDocument();
    } else {
        doc = new FileSerializedDocument();
    }
    try {
        doc.serialize(object);
    } catch (final IOException | SaxonApiException | RuntimeException e) {
        // the caller never gets hold of the document, so release it here (removes the temporary file)
        try {
            doc.close();
        } catch (final IOException closeError) {
            e.addSuppressed(closeError);
        }
        throw e;
    }
    return doc;
}
/**
 * Schema validation is skipped when no valid scenario has been selected.
 *
 * @param results the bag of the current check
 * @return true when there is no usable scenario
 */
@Override
public boolean isSkipped(Bag results) {
public boolean isSkipped(final Bag results) {
return hasNoScenario(results);
}
/** Returns true when the bag holds no scenario selection result or that result is invalid. */
private static boolean hasNoScenario(Bag results) {
private static boolean hasNoScenario(final Bag results) {
return results.getScenarioSelectionResult() == null || results.getScenarioSelectionResult().isInvalid();
}
/**
 * Supplies the {@link Source} used for validation. Closing does nothing unless an implementation overrides it.
 */
private interface SourceProvider extends AutoCloseable {

    @Override
    default void close() throws IOException {
        // nothing
    }

    /**
     * @return the source to validate
     * @throws IOException when the source can not be opened
     */
    Source getSource() throws IOException;
}
/**
 * A {@link SourceProvider} backed by a serialized copy of an already parsed document.
 */
private interface SerializedDocument extends AutoCloseable, SourceProvider {

    /**
     * Opens a stream on the serialized document.
     *
     * @return a stream over the serialized bytes
     * @throws IOException when the stream can not be opened
     */
    InputStream openStream() throws IOException;

    /**
     * Writes the given node to the backing store.
     *
     * @param node the document to serialize
     */
    void serialize(XdmNode node) throws SaxonApiException, IOException;

    /** Wraps a freshly opened stream into a {@link StreamSource}. */
    @Override
    default Source getSource() throws IOException {
        final InputStream stream = openStream();
        return new StreamSource(stream);
    }
}
}