LibGfx+LibPDF: Simpler and faster N-D linear sampling

Previously, if we wanted to to e.g. do linear interpolation in 2-D,
we'd get a sample point like (1.3, 4.4), then get 4 samples around
it at (1, 4), (2, 4), (1, 5), (2, 5), then reduce the 4 samples
to 2 samples by computing the combined samples
`0.3 * f(1, 4) + 0.7 * f(2, 4)` and `0.3 * f(1, 5) + 0.8 * f(2, 5)`,
and then 1-D linearly blending between these two samples with the
factor 0.4. In the end we'd multiply the first value by 0.3 * 0.4,
the second by 0.7 * 0.4, the third by 0.3 * 0.6, and the third by
0.7 * 0.6, and then sum them all up.

This requires computing and storing 2**N samples, followed by
another 2**N iterations to combine the 2**N sampls to a single value.
(N is in practice either 4 or 3, so 2**N isn't super huge.)

Instead, for every sample we can directly compute the product of
weights and sum them up directly. This lets us omit the second loop
and storing 2**N values, in exchange for doing an additional O(n)
work to compute the product.

Takes

    Build/lagom/bin/image --no-output --invert-cmyk \
        --assign-color-profile \
            Build/lagom/Root/res/icc/Adobe/CMYK/USWebCoatedSWOP.icc \
        --convert-to-color-profile serenity-sRGB.icc \
        cmyk.jpg

form 3.42s to 3.08s on my machine, almost 10% faster (and less code).

Here cmyk.jpg is a 2253x3080 cmyk jpeg, and USWebCoatedSWOP.icc is an
mft2 profile with input tables with 256 samples and a 9x9x9x9 CLUT.

The LibPDF change is covered by TEST_CASE(sampled) in LibPDF.cpp,
and the LibGfx change is basically the same change as the one in
LibPDF (where the test results don't change) and the output
subjectively looks identical. So hopefully this causes indeed no
behavior change :^)
This commit is contained in:
Nico Weber 2024-02-02 12:05:12 -05:00 committed by Jelle Raaijmakers
parent c4780f4ee1
commit f562c470e2
2 changed files with 16 additions and 20 deletions

View file

@ -46,23 +46,20 @@ inline FloatVector3 lerp_nd(Function<unsigned(size_t)> size, Function<FloatVecto
factor.append(ec - left_index[i]);
}
Vector<FloatVector3> samples;
samples.resize(1u << x.size());
FloatVector3 sample_output {};
// The i'th bit of mask indicates if the i'th coordinate is rounded up or down.
Vector<unsigned> coordinates;
coordinates.resize(x.size());
for (size_t mask = 0; mask < (1u << x.size()); ++mask) {
for (size_t i = 0; i < x.size(); ++i)
float sample_weight = 1.0f;
for (size_t i = 0; i < x.size(); ++i) {
coordinates[i] = left_index[i] + ((mask >> i) & 1u);
samples[mask] = sample(coordinates);
sample_weight *= ((mask >> i) & 1u) ? factor[i] : 1.0f - factor[i];
}
sample_output += sample(coordinates) * sample_weight;
}
for (int i = static_cast<int>(x.size() - 1); i >= 0; --i) {
for (size_t mask = 0; mask < (1u << i); ++mask)
samples[mask] = mix(samples[mask], samples[mask | (1u << i)], factor[i]);
}
return samples[0];
return sample_output;
}
using S15Fixed16 = FixedPoint<16, i32>;

View file

@ -206,23 +206,22 @@ PDFErrorOr<ReadonlySpan<float>> SampledFunction::evaluate(ReadonlySpan<float> xs
// then 2 by interpolating along y, then 1 by interpolating along x.
// So for the general case, we create 2**N samples, and then for each coordinate, we cut the number of samples in half
// by interpolating along that coordinate.
Vector<float> samples;
samples.resize(1 << m_domain.size());
// Instead of storing all the 2**N samples, we can calculate the product of weights for each corner,
// and sum up the weighted samples.
float sample_output = 0;
// The i'th bit of mask indicates if the i'th coordinate is rounded up or down.
Vector<int> coordinates;
coordinates.resize(m_domain.size());
for (size_t mask = 0; mask < (1u << m_domain.size()); ++mask) {
for (size_t i = 0; i < m_domain.size(); ++i)
float sample_weight = 1.0f;
for (size_t i = 0; i < m_domain.size(); ++i) {
coordinates[i] = m_left_index[i] + ((mask >> i) & 1u);
samples[mask] = sample(coordinates, r);
sample_weight *= ((mask >> i) & 1u) ? m_inputs[i] : (1.0f - m_inputs[i]);
}
sample_output += sample(coordinates, r) * sample_weight;
}
for (int i = static_cast<int>(m_domain.size() - 1); i >= 0; --i) {
for (size_t mask = 0; mask < (1u << i); ++mask)
samples[mask] = mix(samples[mask], samples[mask | (1 << i)], m_inputs[i]);
}
float result = interpolate(samples[0], 0.0f, 255.0f, m_decode[r].lower, m_decode[r].upper);
float result = interpolate(sample_output, 0.0f, 255.0f, m_decode[r].lower, m_decode[r].upper);
m_outputs[r] = clamp(result, m_range[r].lower, m_range[r].upper);
}