Arrow JDBC Adapter

Arrow Java JDBC 模組 會將 JDBC ResultSets 轉換為 Arrow VectorSchemaRoots。

ResultSet 轉 VectorSchemaRoot 範例

協助我們將 ResultSet 轉換為 VectorSchemaRoot 的主要類別是 JdbcToArrow

import org.apache.arrow.adapter.jdbc.ArrowVectorIterator;
import org.apache.arrow.adapter.jdbc.JdbcToArrow;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.ibatis.jdbc.ScriptRunner;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;

try (BufferAllocator allocator = new RootAllocator();
     Connection connection = DriverManager.getConnection(
             "jdbc:h2:mem:h2-jdbc-adapter")) {
    ScriptRunner runnerDDLDML = new ScriptRunner(connection);
    runnerDDLDML.setLogWriter(null);
    runnerDDLDML.runScript(new BufferedReader(
            new FileReader("./thirdpartydeps/jdbc/h2-ddl.sql")));
    runnerDDLDML.runScript(new BufferedReader(
            new FileReader("./thirdpartydeps/jdbc/h2-dml.sql")));
    try (ResultSet resultSet = connection.createStatement().executeQuery(
            "SELECT int_field1, bool_field2, bigint_field5 FROM TABLE1");
         ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator(
                 resultSet, allocator)) {
        while (iterator.hasNext()) {
            try (VectorSchemaRoot root = iterator.next()) {
                System.out.print(root.contentToTSVString());
            }
        }
    }
} catch (SQLException | IOException e) {
    e.printStackTrace();
}
INT_FIELD1    BOOL_FIELD2    BIGINT_FIELD5
101    true    1000000000300
102    true    100000000030
103    true    10000000003

配置陣列子類型

JdbcToArrow 會透過 JdbcToArrowConfig 接受配置。例如,陣列欄中元素的類型可以使用 setArraySubTypeByColumnNameMap 指定。

import org.apache.arrow.adapter.jdbc.ArrowVectorIterator;
import org.apache.arrow.adapter.jdbc.JdbcFieldInfo;
import org.apache.arrow.adapter.jdbc.JdbcToArrow;
import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig;
import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder;
import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.ibatis.jdbc.ScriptRunner;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Types;
import java.util.HashMap;

try (BufferAllocator allocator = new RootAllocator();
     Connection connection = DriverManager.getConnection(
             "jdbc:h2:mem:h2-jdbc-adapter")) {
    ScriptRunner runnerDDLDML = new ScriptRunner(connection);
    runnerDDLDML.setLogWriter(null);
    runnerDDLDML.runScript(new BufferedReader(
            new FileReader("./thirdpartydeps/jdbc/h2-ddl.sql")));
    runnerDDLDML.runScript(new BufferedReader(
            new FileReader("./thirdpartydeps/jdbc/h2-dml.sql")));
    JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator,
            JdbcToArrowUtils.getUtcCalendar())
            .setArraySubTypeByColumnNameMap(
                    new HashMap<>() {{
                        put("LIST_FIELD19",
                                new JdbcFieldInfo(Types.INTEGER));
                    }}
            )
            .build();
    try (ResultSet resultSet = connection.createStatement().executeQuery(
            "SELECT int_field1, bool_field2, bigint_field5, char_field16, list_field19 FROM TABLE1");
         ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator(
                 resultSet, config)) {
        while (iterator.hasNext()) {
            try (VectorSchemaRoot root = iterator.next()) {
                System.out.print(root.contentToTSVString());
            }
        }
    }
} catch (SQLException | IOException e) {
    e.printStackTrace();
}
INT_FIELD1    BOOL_FIELD2    BIGINT_FIELD5    CHAR_FIELD16    LIST_FIELD19
101    true    1000000000300    some char text      [1,2,3]
102    true    100000000030    some char text      [1,2]
103    true    10000000003    some char text      [1]

配置批次大小

預設情況下,此轉接器會在一個批次中讀取多達 1024 列的資料。這可以使用 setTargetBatchSize 自訂。

import org.apache.arrow.adapter.jdbc.ArrowVectorIterator;
import org.apache.arrow.adapter.jdbc.JdbcFieldInfo;
import org.apache.arrow.adapter.jdbc.JdbcToArrow;
import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig;
import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder;
import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.ibatis.jdbc.ScriptRunner;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Types;
import java.util.HashMap;

try (BufferAllocator allocator = new RootAllocator();
     Connection connection = DriverManager.getConnection(
             "jdbc:h2:mem:h2-jdbc-adapter")) {
    ScriptRunner runnerDDLDML = new ScriptRunner(connection);
    runnerDDLDML.setLogWriter(null);
    runnerDDLDML.runScript(new BufferedReader(
            new FileReader("./thirdpartydeps/jdbc/h2-ddl.sql")));
    runnerDDLDML.runScript(new BufferedReader(
            new FileReader("./thirdpartydeps/jdbc/h2-dml.sql")));
    JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator,
            JdbcToArrowUtils.getUtcCalendar())
            .setTargetBatchSize(2)
            .setArraySubTypeByColumnNameMap(
                    new HashMap<>() {{
                        put("LIST_FIELD19",
                                new JdbcFieldInfo(Types.INTEGER));
                    }}
            )
            .build();
    try (ResultSet resultSet = connection.createStatement().executeQuery(
            "SELECT int_field1, bool_field2, bigint_field5, char_field16, list_field19 FROM TABLE1");
         ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator(
                 resultSet, config)) {
        while (iterator.hasNext()) {
            try (VectorSchemaRoot root = iterator.next()) {
                System.out.print(root.contentToTSVString());
            }
        }
    }
} catch (SQLException | IOException e) {
    e.printStackTrace();
}
INT_FIELD1    BOOL_FIELD2    BIGINT_FIELD5    CHAR_FIELD16    LIST_FIELD19
101    true    1000000000300    some char text      [1,2,3]
102    true    100000000030    some char text      [1,2]
INT_FIELD1    BOOL_FIELD2    BIGINT_FIELD5    CHAR_FIELD16    LIST_FIELD19
103    true    10000000003    some char text      [1]

配置數字 (十進制) 的精確度和比例

預設情況下,任何十進位數值的比例必須與欄的 Arrow 類型的定義比例完全相符,否則會擲回一個 UnsupportedOperationException,並顯示類似 BigDecimal scale must equal that in the Arrow vector 的訊息。

會發生這種情況是因為 Arrow 會根據 ResultSet 的資料描述來推斷類型,而這並非適用於所有資料庫驅動程式。JDBC 轉接器讓您可以透過覆寫十進位制比例,或透過 setBigDecimalRoundingMode 提供 RoundingMode,將值轉換為預期的比例,藉此避免這種情況。

在此範例中,我們有一個 BigInt 欄。預設情況下,推斷的比例為 0。我們將比例覆寫成 7,然後設定 RoundingMode 將值轉換為指定的比例。

import org.apache.arrow.adapter.jdbc.ArrowVectorIterator;
import org.apache.arrow.adapter.jdbc.JdbcFieldInfo;
import org.apache.arrow.adapter.jdbc.JdbcToArrow;
import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig;
import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder;
import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.ibatis.jdbc.ScriptRunner;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.math.RoundingMode;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Types;
import java.util.HashMap;

try (BufferAllocator allocator = new RootAllocator();
     Connection connection = DriverManager.getConnection(
             "jdbc:h2:mem:h2-jdbc-adapter")) {
    ScriptRunner runnerDDLDML = new ScriptRunner(connection);
    runnerDDLDML.setLogWriter(null);
    runnerDDLDML.runScript(new BufferedReader(
            new FileReader("./thirdpartydeps/jdbc/h2-ddl.sql")));
    runnerDDLDML.runScript(new BufferedReader(
            new FileReader("./thirdpartydeps/jdbc/h2-dml.sql")));
    JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator,
            JdbcToArrowUtils.getUtcCalendar())
            .setTargetBatchSize(2)
            .setArraySubTypeByColumnNameMap(
                    new HashMap<>() {{
                        put("LIST_FIELD19",
                                new JdbcFieldInfo(Types.INTEGER));
                    }}
            )
            .setExplicitTypesByColumnName(
                    new HashMap<>() {{
                        put("BIGINT_FIELD5",
                                new JdbcFieldInfo(Types.DECIMAL, 20, 7));
                    }}
            )
            .setBigDecimalRoundingMode(RoundingMode.UNNECESSARY)
            .build();
    try (ResultSet resultSet = connection.createStatement().executeQuery(
            "SELECT int_field1, bool_field2, bigint_field5, char_field16, list_field19 FROM TABLE1");
         ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator(
                 resultSet, config)) {
        while (iterator.hasNext()) {
            try (VectorSchemaRoot root = iterator.next()) {
                System.out.print(root.contentToTSVString());
            }
        }
    }
} catch (SQLException | IOException e) {
    e.printStackTrace();
}
INT_FIELD1    BOOL_FIELD2    BIGINT_FIELD5    CHAR_FIELD16    LIST_FIELD19
101    true    1000000000300.0000000    some char text      [1,2,3]
102    true    100000000030.0000000    some char text      [1,2]
INT_FIELD1    BOOL_FIELD2    BIGINT_FIELD5    CHAR_FIELD16    LIST_FIELD19
103    true    10000000003.0000000    some char text      [1]