資料操作

與比較、篩選或轉換資料相關的範例。

連接 VectorSchemaRoots

在某些情況下,VectorSchemaRoot 需要被建模為一個容器。為此,您可以使用 VectorSchemaRootAppender.append。以下程式碼會建立兩個 VectorSchemaRoot,然後將它們串接在一起:

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.arrow.vector.util.VectorSchemaRootAppender;

import static java.util.Arrays.asList;

// Single-column schema shared by the two source roots and the target root.
Field intColumn = new Field("column-one", FieldType.nullable(new ArrowType.Int(32, true)), null);
Schema sharedSchema = new Schema(asList(intColumn));
try (
    BufferAllocator allocator = new RootAllocator();
    VectorSchemaRoot firstRoot = VectorSchemaRoot.create(sharedSchema, allocator);
    VectorSchemaRoot secondRoot = VectorSchemaRoot.create(sharedSchema, allocator);
    VectorSchemaRoot combined = VectorSchemaRoot.create(sharedSchema, allocator);
) {
    // First source root: two rows (100, 20).
    firstRoot.allocateNew();
    IntVector firstColumn = (IntVector) firstRoot.getVector(0);
    firstColumn.set(0, 100);
    firstColumn.set(1, 20);
    firstRoot.setRowCount(2);

    // Second source root: two rows (34, 75).
    secondRoot.allocateNew();
    IntVector secondColumn = (IntVector) secondRoot.getVector(0);
    secondColumn.set(0, 34);
    secondColumn.set(1, 75);
    secondRoot.setRowCount(2);

    // Append both sources to the target root, in argument order, and print it as TSV.
    combined.allocateNew();
    VectorSchemaRootAppender.append(combined, firstRoot, secondRoot);
    System.out.print(combined.contentToTSVString());
}
column-one
100
20
34
75

連接 Value Vectors

在某些情況下,我們需要將兩個值向量串接成一個。為了完成這項工作,我們可以使用 VectorAppender。這會改變初始的 ValueVector。

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.util.VectorAppender;

try (
    BufferAllocator allocator = new RootAllocator();
    IntVector target = new IntVector("initialValues", allocator);
    IntVector extra = new IntVector("toAppend", allocator);
) {
    // Vector that will receive the appended values: [1, 2].
    target.allocateNew(2);
    target.set(0, 1);
    target.set(1, 2);
    target.setValueCount(2);
    System.out.println("Initial IntVector: " + target);

    // Four slots, but only indexes 1 and 3 are written; the others print as null.
    extra.allocateNew(4);
    extra.set(1, 4);
    extra.set(3, 6);
    extra.setValueCount(4);
    System.out.println("IntVector to Append: " + extra);

    // The appender is bound to `target`; visiting `extra` with it modifies `target` itself.
    extra.accept(new VectorAppender(target), null);
    System.out.println("IntVector Result: " + target);
}
Initial IntVector: [1, 2]
IntVector to Append: [null, 4, null, 6]
IntVector Result: [1, 2, null, 4, null, 6]

比較 Vector 的欄位相等性

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.compare.TypeEqualsVisitor;
import org.apache.arrow.memory.RootAllocator;

// All three vectors are declared as resources so each one is closed before the allocator is.
// The original created left1/left2 inside the body and never closed them.
try (
    BufferAllocator allocator = new RootAllocator();
    IntVector right = new IntVector("int", allocator);
    IntVector left1 = new IntVector("int", allocator);
    IntVector left2 = new IntVector("int2", allocator);
) {
    right.allocateNew(3);
    right.set(0, 10);
    right.set(1, 20);
    right.set(2, 30);
    right.setValueCount(3);

    // The visitor compares each candidate against `right`.
    // left1 and left2 hold no values; they differ from each other only by field name.
    TypeEqualsVisitor visitor = new TypeEqualsVisitor(right);

    System.out.println(visitor.equals(left1)); // same name "int"
    System.out.println(visitor.equals(left2)); // different name "int2"
}
true
false

比較 Vector 的相等性

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.compare.VectorEqualsVisitor;

try (
    BufferAllocator allocator = new RootAllocator();
    // NOTE(review): all three vectors deliberately share the field name "vector1";
    // presumably the default equality check also compares field metadata — confirm before renaming.
    IntVector vector1 = new IntVector("vector1", allocator);
    IntVector vector2 = new IntVector("vector1", allocator);
    IntVector vector3 = new IntVector("vector1", allocator)
) {
    vector1.allocateNew(1);
    vector1.set(0, 10);
    vector1.setValueCount(1);

    // Same content as vector1.
    vector2.allocateNew(1);
    vector2.set(0, 10);
    vector2.setValueCount(1);

    // Different content from vector1.
    vector3.allocateNew(1);
    vector3.set(0, 20);
    vector3.setValueCount(1);

    // vectorEquals is a static method, so it is called on the class rather than
    // through a throwaway VectorEqualsVisitor instance.
    System.out.println(VectorEqualsVisitor.vectorEquals(vector1, vector2));
    System.out.println(VectorEqualsVisitor.vectorEquals(vector1, vector3));
}
true
false

在陣列上比較值

比較向量中給定索引上的兩個值

import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.memory.RootAllocator;

try (
    BufferAllocator allocator = new RootAllocator();
    VarCharVector vec = new VarCharVector("valueindexcomparator", allocator);
) {
    vec.allocateNew(3);
    vec.setValueCount(3);
    // Encode explicitly as UTF-8 instead of relying on the platform default charset,
    // so the stored bytes are the same on every JVM.
    vec.set(0, "ba".getBytes(java.nio.charset.StandardCharsets.UTF_8));
    vec.set(1, "abc".getBytes(java.nio.charset.StandardCharsets.UTF_8));
    vec.set(2, "aa".getBytes(java.nio.charset.StandardCharsets.UTF_8));

    // The comparator works on indexes of the vector attached to it.
    VectorValueComparator<VarCharVector> valueComparator = DefaultVectorComparators.createDefaultComparator(vec);
    valueComparator.attachVector(vec);

    System.out.println(valueComparator.compare(0, 1) > 0); // "ba" vs "abc"
    System.out.println(valueComparator.compare(1, 2) < 0); // "abc" vs "aa"
}
true
false

請注意,如果需要自訂的比較器,我們可以繼承 VectorValueComparator,並視需要覆寫 compareNotNull 方法。

在陣列上搜尋值

線性搜尋 - O(n)

演算法:org.apache.arrow.algorithm.search.VectorSearcher#linearSearch - O(n)

import org.apache.arrow.algorithm.search.VectorSearcher;
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;

try (
    BufferAllocator allocator = new RootAllocator();
    IntVector haystack = new IntVector("linearSearchVector", allocator);
) {
    final int size = 10;
    haystack.allocateNew(size);
    haystack.setValueCount(size);
    // Slot k holds the value k, i.e. the vector is [0, 1, ..., 9].
    for (int slot = 0; slot < size; slot++) {
        haystack.set(slot, slot);
    }

    VectorValueComparator<IntVector> intComparator =
        DefaultVectorComparators.createDefaultComparator(haystack);

    // The same vector is passed as both target and key vector; the key is its element at index 3.
    final int keyIndex = 3;
    int foundAt = VectorSearcher.linearSearch(haystack, intComparator, haystack, keyIndex);

    System.out.println(foundAt);
}
3

二元搜尋 - O(log(n))

演算法:org.apache.arrow.algorithm.search.VectorSearcher#binarySearch - O(log(n))

import org.apache.arrow.algorithm.search.VectorSearcher;
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;

try (
    BufferAllocator allocator = new RootAllocator();
    IntVector sortedValues = new IntVector("", allocator);
) {
    final int size = 10;
    sortedValues.allocateNew(size);
    sortedValues.setValueCount(size);
    // Slot k holds the value k, so the vector is already in ascending order.
    for (int slot = 0; slot < size; slot++) {
        sortedValues.set(slot, slot);
    }

    VectorValueComparator<IntVector> intComparator =
        DefaultVectorComparators.createDefaultComparator(sortedValues);

    // The same vector is passed as both target and key vector; the key is its element at index 3.
    final int keyIndex = 3;
    int foundAt = VectorSearcher.binarySearch(sortedValues, intComparator, sortedValues, keyIndex);

    System.out.println(foundAt);
}
3

在陣列上排序值

原地排序器 - O(nlog(n))

透過操作原始向量進行排序。演算法:org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter - O(nlog(n))

import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;

try (
    BufferAllocator allocator = new RootAllocator();
    IntVector values = new IntVector("intvectornotsorted", allocator);
) {
    // Unsorted input: [10, 8, null].
    values.allocateNew(3);
    values.setValueCount(3);
    values.set(0, 10);
    values.set(1, 8);
    values.setNull(2);

    VectorValueComparator<IntVector> ordering =
        DefaultVectorComparators.createDefaultComparator(values);
    FixedWidthInPlaceVectorSorter<IntVector> inPlaceSorter = new FixedWidthInPlaceVectorSorter<>();

    // Sorts `values` itself; no second vector is involved.
    inPlaceSorter.sortInPlace(values, ordering);

    System.out.println(values);
}
[null, 8, 10]

異地排序器 - O(nlog(n))

依排序後的順序將向量元素複製到新的向量 - O(nlog(n))。演算法:org.apache.arrow.algorithm.sort.FixedWidthOutOfPlaceVectorSorter 與 org.apache.arrow.algorithm.sort.VariableWidthOutOfPlaceVectorSorter

import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.FixedWidthOutOfPlaceVectorSorter;
import org.apache.arrow.algorithm.sort.OutOfPlaceVectorSorter;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;

try (
    BufferAllocator allocator = new RootAllocator();
    IntVector source = new IntVector("intvectornotsorted", allocator);
    // Destination vector created from the source's field type, under a different name.
    IntVector sortedCopy = (IntVector) source.getField().getFieldType()
            .createNewSingleVector("new-out-of-place-sorter", allocator, null);
) {
    // Unsorted input: [10, 8, null].
    source.allocateNew(3);
    source.setValueCount(3);
    source.set(0, 10);
    source.set(1, 8);
    source.setNull(2);

    OutOfPlaceVectorSorter<IntVector> outOfPlaceSorter = new FixedWidthOutOfPlaceVectorSorter<>();
    VectorValueComparator<IntVector> ordering =
        DefaultVectorComparators.createDefaultComparator(source);

    // Size the destination to match the source before sorting into it.
    int rowCount = source.getValueCount();
    sortedCopy.allocateNew(rowCount);
    sortedCopy.setValueCount(rowCount);

    // The sorted values are written to `sortedCopy`, which is what gets printed.
    outOfPlaceSorter.sortOutOfPlace(source, sortedCopy, ordering);

    System.out.println(sortedCopy);
}
[null, 8, 10]