Skip to content

Commit e298f04

Browse files
committed
MLE-27509 No longer using ByteArrayDataSource on a multipart response
This should be a no-op for existing tests, since none of them reads a document large enough to cause an out-of-memory error. ArchUnit is used to verify that ByteArrayDataSource is not used anywhere except the one place where it is allowed (PartIterator).
1 parent 46d1b40 commit e298f04

4 files changed

Lines changed: 113 additions & 7 deletions

File tree

marklogic-client-api/build.gradle

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,9 @@ dependencies {
7676

7777
testImplementation 'org.skyscreamer:jsonassert:1.5.3'
7878

79+
// ArchUnit for verifying certain classes aren't used
80+
testImplementation 'com.tngtech.archunit:archunit-junit5:1.4.1'
81+
7982
// Automatic loading of test framework implementation dependencies is deprecated.
8083
// https://docs.gradle.org/current/userguide/upgrading_version_8.html#test_framework_implementation_dependencies
8184
// Without this, once using JUnit 5.12 or higher, Gradle will not find any tests and report an error of:

marklogic-client-api/src/main/java/com/marklogic/client/impl/OkHttpServices.java

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@
3737
import jakarta.mail.internet.ContentDisposition;
3838
import jakarta.mail.internet.MimeMultipart;
3939
import jakarta.mail.internet.ParseException;
40-
import jakarta.mail.util.ByteArrayDataSource;
4140
import jakarta.xml.bind.DatatypeConverter;
41+
import com.marklogic.client.impl.okhttp.InputStreamDataSource;
4242
import okhttp3.*;
4343
import okhttp3.MultipartBody.Part;
4444
import okhttp3.logging.HttpLoggingInterceptor;
@@ -5305,7 +5305,9 @@ static private <T> T getEntity(ResponseBody body, Class<T> as) {
53055305
} else if (as == MimeMultipart.class) {
53065306
MediaType mediaType = body.contentType();
53075307
String contentType = (mediaType != null) ? mediaType.toString() : "application/x-unknown-content-type";
5308-
ByteArrayDataSource dataSource = new ByteArrayDataSource(body.byteStream(), contentType);
5308+
// Use custom DataSource to avoid reading document into memory. Allows a user to use an
5309+
// InputStreamHandle to fetch the content without being surprised that all the data is in memory already.
5310+
InputStreamDataSource dataSource = new InputStreamDataSource(body.byteStream(), contentType);
53095311
return (T) new MimeMultipart(dataSource);
53105312
} else if (as == File.class) {
53115313
// write out the response body to a temp file in the system temp folder
@@ -6055,12 +6057,10 @@ void setResponse(Response response) {
60556057
setNull(true);
60566058
return;
60576059
}
6058-
ByteArrayDataSource dataSource = new ByteArrayDataSource(
6059-
responseBody.byteStream(), contentType.toString()
6060-
);
6060+
// Use custom DataSource to avoid reading document into memory. Allows a user to use an
6061+
// InputStreamHandle to fetch the content without being surprised that all the data is in memory already.
6062+
InputStreamDataSource dataSource = new InputStreamDataSource(responseBody.byteStream(), contentType.toString());
60616063
setMultipart(new MimeMultipart(dataSource));
6062-
} catch (IOException e) {
6063-
throw new MarkLogicIOException(e);
60646064
} catch (MessagingException e) {
60656065
throw new MarkLogicIOException(e);
60666066
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
/*
2+
* Copyright (c) 2010-2026 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
3+
*/
4+
package com.marklogic.client.impl.okhttp;
5+
6+
import jakarta.activation.DataSource;
7+
8+
import java.io.InputStream;
9+
import java.io.OutputStream;
10+
11+
/**
12+
* A streaming DataSource implementation that wraps an InputStream without buffering it into memory.
13+
* This is a critical component for enabling true streaming of large documents from MarkLogic,
14+
* avoiding OutOfMemoryErrors when processing large result sets.
15+
* <p>
16+
* Unlike ByteArrayDataSource (which loads the entire stream into a byte array), this implementation
17+
* preserves the streaming nature of the underlying InputStream, allowing documents to be processed
18+
* incrementally as they are read from the network.
19+
* <p>
20+
* Note: This DataSource is read-only. The getOutputStream() method throws UnsupportedOperationException.
21+
*/
22+
public class InputStreamDataSource implements DataSource {
23+
24+
private final InputStream inputStream;
25+
private final String contentType;
26+
27+
/**
28+
* Creates a new InputStreamDataSource.
29+
*
30+
* @param inputStream the InputStream to wrap (will not be buffered into memory)
31+
* @param contentType the MIME type of the data
32+
*/
33+
public InputStreamDataSource(InputStream inputStream, String contentType) {
34+
this.inputStream = inputStream;
35+
this.contentType = contentType;
36+
}
37+
38+
@Override
39+
public InputStream getInputStream() {
40+
return inputStream;
41+
}
42+
43+
@Override
44+
public OutputStream getOutputStream() {
45+
throw new UnsupportedOperationException("InputStreamDataSource is read-only");
46+
}
47+
48+
@Override
49+
public String getContentType() {
50+
return contentType;
51+
}
52+
53+
@Override
54+
public String getName() {
55+
return null;
56+
}
57+
}
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/*
2+
* Copyright (c) 2010-2026 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
3+
*/
4+
package com.marklogic.client.test;
5+
6+
import com.marklogic.client.impl.okhttp.PartIterator;
7+
import com.tngtech.archunit.core.domain.JavaClasses;
8+
import com.tngtech.archunit.core.importer.ClassFileImporter;
9+
import com.tngtech.archunit.core.importer.ImportOption;
10+
import com.tngtech.archunit.lang.ArchRule;
11+
import jakarta.mail.util.ByteArrayDataSource;
12+
import org.junit.jupiter.api.Test;
13+
14+
import static com.tngtech.archunit.lang.syntax.ArchRuleDefinition.noClasses;
15+
16+
/**
17+
* Architecture tests to ensure streaming best practices are followed.
18+
* <p>
19+
* ByteArrayDataSource defeats streaming by loading entire InputStreams into memory.
20+
* We use InputStreamDataSource instead to enable true streaming for large documents.
21+
*/
22+
class VerifyByteArrayDataSourceIsNotUsedTest {
23+
24+
private final JavaClasses classes = new ClassFileImporter()
25+
.withImportOption(ImportOption.Predefined.DO_NOT_INCLUDE_TESTS)
26+
.importPackages("com.marklogic.client");
27+
28+
@Test
29+
void shouldNotUseByteArrayDataSourceInProduction() {
30+
ArchRule rule = noClasses()
31+
// PartIterator can use ByteArrayDataSource because for an eval/invoke use case, it's very likely the user
32+
// wants the results in memory so they can perform some operation on them. If this proves false in the
33+
// future, PartIterator can be adjusted to use InputStreamDataSource instead.
34+
.that().doNotHaveSimpleName(PartIterator.class.getSimpleName())
35+
36+
.should().dependOnClassesThat().haveSimpleName(ByteArrayDataSource.class.getSimpleName())
37+
.because("MLE-27509 identifies a problem where a multipart response was having each part " +
38+
"processed with the ByteArrayDataSource, which loads the entire stream into memory. This is " +
39+
"surprising to any user using InputStreamHandle to access the content of a document, as that user " +
40+
"is likely expecting to stream document from MarkLogic to some target. The InputStreamDataSource " +
41+
"class was created to avoiding reading the contents of the document into an in-memory byte " +
42+
"array, thus allowing for streaming reads to occur via a multipart response.");
43+
44+
rule.check(classes);
45+
}
46+
}

0 commit comments

Comments
 (0)