Description
When generating code for xor(and(A, C), and(B, C))
in taco certain implementations do not match numpy.
The below statement produces the correct code that matches dense numpy and pydata/sparse. For data/image/no/image1.jpg
it produces 816 nnzs:
temp1(i,j) = andOp1(A(i, j), C(i, j));
temp2(i,j) = andOp1(B(i, j), C(i, j));
result(i, j) = xorOp1(temp1(i,j), temp2(i,j));
This (supposedly) equivalent statement produces too many zeros. For data/image/no/image1.jpg
it produces 2646 nnzs:
result(i, j) = xorOp1(andOp1(matrix1(i, j), matrix3(i, j)), andOp1(matrix2(i, j), matrix3(i, j)));
This (supposedly) equivalent statement also produces too many zeros and produces the same result as 2). For data/image/no/image1.jpg
it produces 2646 nnzs:
result(i, j) = xorAndOp(matrix1(i, j), matrix2(i, j), matrix3(i, j));
For case 3) the generated code from the lowerer is:
int compute(taco_tensor_t *result, taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
int result1_dimension = (int)(result->dimensions[0]);
int64_t* restrict result_vals = (int64_t*)(result->vals);
int A1_dimension = (int)(A->dimensions[0]);
int* restrict A2_pos = (int*)(A->indices[1][0]);
int* restrict A2_crd = (int*)(A->indices[1][1]);
int B1_dimension = (int)(B->dimensions[0]);
int* restrict B2_pos = (int*)(B->indices[1][0]);
int* restrict B2_crd = (int*)(B->indices[1][1]);
int C1_dimension = (int)(C->dimensions[0]);
int* restrict C2_pos = (int*)(C->indices[1][0]);
int* restrict C2_crd = (int*)(C->indices[1][1]);
int32_t jresult = 0;
for (int32_t i = 0; i < C1_dimension; i++) {
int32_t jA = A2_pos[i];
int32_t pA2_end = A2_pos[(i + 1)];
int32_t jC = C2_pos[i];
int32_t pC2_end = C2_pos[(i + 1)];
int32_t jB = B2_pos[i];
int32_t pB2_end = B2_pos[(i + 1)];
while ((jA < pA2_end && jC < pC2_end) && jB < pB2_end) {
int32_t jA0 = A2_crd[jA];
int32_t jC0 = C2_crd[jC];
int32_t jB0 = B2_crd[jB];
int32_t j = TACO_MIN(jA0,TACO_MIN(jC0,jB0));
if ((jA0 == j && jC0 == j) && jB0 == j) {
result_vals[jresult] = 1;
jresult++;
}
else if (jB0 == j && jC0 == j) {
result_vals[jresult] = 1;
jresult++;
}
else if (jA0 == j && jC0 == j) {
result_vals[jresult] = 1;
jresult++;
}
jA += (int32_t)(jA0 == j);
jC += (int32_t)(jC0 == j);
jB += (int32_t)(jB0 == j);
}
while (jB < pB2_end && jC < pC2_end) {
int32_t jB0 = B2_crd[jB];
int32_t jC0 = C2_crd[jC];
int32_t j = TACO_MIN(jB0,jC0);
if (jB0 == j && jC0 == j) {
result_vals[jresult] = 1;
jresult++;
}
jB += (int32_t)(jB0 == j);
jC += (int32_t)(jC0 == j);
}
while (jA < pA2_end && jC < pC2_end) {
int32_t jA0 = A2_crd[jA];
int32_t jC0 = C2_crd[jC];
int32_t j = TACO_MIN(jA0,jC0);
if (jA0 == j && jC0 == j) {
result_vals[jresult] = 1;
jresult++;
}
jA += (int32_t)(jA0 == j);
jC += (int32_t)(jC0 == j);
}
}
return 0;
}
Where the case statement if ((jA0 == j && jC0 == j) && jB0 == j)
does not seem correct since the intersection of all three tensors (A, B, and C) should not be included in the defined fused iteration space of xor(and(A, C), and(B, C))
.
The above ops are defined as:
struct Boolean {
ir::Expr operator()(const std::vector<ir::Expr> &v) {
taco_iassert(v.size() >= 1) << "Add operator needs at least one operand";
return ir::Literal::make(int64_t(1), v[0].type());
}
};
struct xorAlgebra {
IterationAlgebra operator()(const std::vector<IndexExpr>& regions) {
IterationAlgebra noIntersect = Complement(Intersect(regions[0], regions[1]));
return Intersect(noIntersect, Union(regions[0], regions[1]));
}
};
struct andAlgebra {
IterationAlgebra operator()(const std::vector<IndexExpr>& regions) {
return Intersect(regions[0], regions[1]);
}
};
struct xorAndAlgebra {
IterationAlgebra operator()(const std::vector<IndexExpr>& regions) {
auto m1 = Intersect(regions[0], regions[2]);
auto m2 = Intersect(regions[1], regions[2]);
auto noIntersect = Complement(Intersect(m1, m2));
return Intersect(noIntersect, Union(m1, m2));
}
};
Func xorOp1("logical_xor", Boolean(), xorAlgebra());
Func andOp1("logical_and", Boolean(), andAlgebra());
Func xorAndOp("fused_xor_and", Boolean(), xorAndAlgebra());