#	User	Rating
1	jiangly	3898
2	tourist	3840
3	orzdevinwang	3706
4	ksun48	3691
5	jqdai0815	3682
6	ecnerwala	3525
7	gamegame	3477
8	Benq	3468
9	Ormlis	3381
10	maroonrk	3379

#	User	Contrib.
1	cry	168
2	-is-this-fft-	165
3	Dominater069	161
4	atcoder_official	160
5	Um_nik	159
6	djm03178	157
7	adamant	153
8	luogu_official	151
9	awoo	149
10	TheScrasse	146

In this blog you can found an efficient implementation of Karatsuba multiply of two polinomials, that sometimes can be used instead of FFT

$$$\text{ }$$$

Hello everyone!

One man ask me to write a code for calculation of $$$1.000.000$$$ fibonacci number in C++. This broblem has been solved in time less than 1 second with Karatsuba multiply. After it, I tried to use this code in standart problems like "we have two strings of letters $$$\text{ATGC}$$$ and want to calculate optimal cyclic rotation of one of them so number of equal positions will be greatest" with Karatsuba multiply, and these problems have been successfully solved. There are a lot of naive implementations, but MrDindows helped to write a most efficient of all simple implementations.

Idea of Karatsuba multiply

We have two polinomials $$$a(x)$$$ and $$$b(x)$$$ with equal length $$$2n$$$ and want multiply them. Let suppose that $$$a(x) = a_0(x) + x^n \cdot a_1(x)$$$ and $$$b(x) = b_0(x) + x^n \cdot b_1(x)$$$. Now we can calculate result of $$$a(x) \cdot b(x)$$$ in next steps:

Calculate $$$E(x) = (a_0(x) + a_1(x)) \cdot (b_0(x) + b_1(x))$$$
Calculate $$$r_0(x)=a_0(x)\cdot b_0(x)$$$
Calculate $$$r_2(x)=a_1(x) \cdot b_1(x)$$$
Now, result is $$$a(x) \cdot b(x) = r_0(x) + x^n \cdot \left(E(x) - r_0(x) - r_2(x)\right) + x^{2n} \cdot r_2(x)$$$

We can see, that instead of $$$4$$$ multiplications we used only $$$3$$$, so we have time $$$O(n^{\log_2(3)})$$$ or $$$O(n^{1.58})$$$

Efficient implementation

#pragma GCC optimize("Ofast,unroll-loops")
#pragma GCC target("avx,avx2,fma") 
namespace {
    template<int n, typename T>
    void mult(const T *__restrict a, const T *__restrict b, T *__restrict res) {
        if (n <= 64) { // if length is small then naive multiplication if faster
            for (int i = 0; i < n; i++) {
                for (int j = 0; j < n; j++) {
                    res[i + j] += a[i] * b[j];
                }
            }
        } else {
            const int mid = n / 2;
            alignas(64) T btmp[n], E[n] = {};
            auto atmp = btmp + mid;
            for (int i = 0; i < mid; i++) {
                atmp[i] = a[i] + a[i + mid]; // atmp(x) - sum of two halfs a(x)
                btmp[i] = b[i] + b[i + mid]; // btmp(x) - sum of two halfs b(x)
            }
            mult<mid>(atmp, btmp, E); // Calculate E(x) = (alow(x) + ahigh(x)) * (blow(x) + bhigh(x))
            mult<mid>(a + 0, b + 0, res); // Calculate rlow(x) = alow(x) * blow(x)
            mult<mid>(a + mid, b + mid, res + n); // Calculate rhigh(x) = ahigh(x) * bhigh(x)
            for (int i = 0; i < mid; i++) { // Then, calculate rmid(x) = E(x) - rlow(x) - rhigh(x) and write in memory
                const auto tmp = res[i + mid];
                res[i + mid] += E[i] - res[i] - res[i + 2 * mid];
                res[i + 2 * mid] += E[i + mid] - tmp - res[i + 3 * mid];
            }
        }
    }
}

Testing

Example of solutions for real problems: 528D. Fuzzy Search and AtCoder Beginner Contest 149 E. Handshake.

Stress-test 512K for 64-bit and 32-bit coefficients

It is possible that this code can be used instead of FFT, but be careful: for multiplication of two polinomials with length $$$2^{19}$$$ Karatsuba multiply requires $$$3.047.478.784$$$ operations. Of cource, it can be executed in 1 second, but for greater length...

#pragma GCC optimize("Ofast,unroll-loops") #pragma GCC target("avx,avx2,fma") #include <bits/stdc++.h> namespace { template<int n, typename T> void mult(const T *__restrict a, const T *__restrict b, T *__restrict res) { if (n <= 64) { // if length is small then naive multiplication if faster for (int i = 0; i < n; i++) { for (int j = 0; j < n; j++) { res[i + j] += a[i] * b[j]; } } } else { const int mid = n / 2; alignas(64) T btmp[n], E[n] = {}; auto atmp = btmp + mid; for (int i = 0; i < mid; i++) { atmp[i] = a[i] + a[i + mid]; // atmp(x) - sum of two halfs a(x) btmp[i] = b[i] + b[i + mid]; // btmp(x) - sum of two halfs b(x) } mult<mid>(atmp, btmp, E); // Calculate E(x) = (alow(x) + ahigh(x)) * (blow(x) + bhigh(x)) mult<mid>(a + 0, b + 0, res); // Calculate rlow(x) = alow(x) * blow(x) mult<mid>(a + mid, b + mid, res + n); // Calculate rhigh(x) = ahigh(x) * bhigh(x) for (int i = 0; i < mid; i++) { // Then, calculate rmid(x) = E(x) - rlow(x) - rhigh(x) and write in memory const auto tmp = res[i + mid]; res[i + mid] += E[i] - res[i] - res[i + 2 * mid]; res[i + 2 * mid] += E[i + mid] - tmp - res[i + 3 * mid]; } } } } using namespace std; #define REP(i, n) for(int i = 0; i < (n); ++i) // Complexity O(A * B^0.6) where A >= B. template<typename T> void rec_kara(T* a, int one, T* b, int two, T* r) { // by UW/Errichto if(min(one, two) <= 20) { // must be at least "<= 1" REP(i, one) REP(j, two) r[i+j] += a[i] * b[j]; return; } const int x = min(one, two); if(one < two) rec_kara(a, x, b + x, two - x, r + x); if(two < one) rec_kara(a + x, one - x, b, x, r + x); const int n = (x + 1) / 2, right = x / 2; vector<T> tu(2 * n); rec_kara(a, n, b, n, tu.data()); REP(i, 2 * n - 1) { r[i] += tu[i]; r[i+n] -= tu[i]; tu[i] = 0; } rec_kara(a + n, right, b + n, right, tu.data()); REP(i, 2 * right - 1) r[i+n] -= tu[i], r[i+2*n] += tu[i]; tu[n-1] = a[n-1]; tu[2*n-1] = b[n-1]; REP(i, right) tu[i] = a[i]+a[i+n], tu[i+n] = b[i]+b[i+n]; rec_kara(tu.data(), n, tu.data() + n, n, r + n); } template<typename T> vector<T> karatsuba(vector<T> a, vector<T> b) { if(a.empty() || b.empty()) return {}; vector<T> r(a.size() + b.size() - 1); rec_kara(a.data(), a.size(), b.data(), b.size(), r.data()); return r; } // ======================================================= vector<int> r(int n) { vector<int> w; for(int i = 0; i < n; ++i) w.push_back(rand() % 10); return w; } void rand_range(int *begin, int *end) { while (begin != end) *begin++ = rand() % 10; } int main() { //vector<int> a = {1,2,3}; //vector<int> b = {5,0,20,500}; //vector<int> r = karatsuba(a, b); //output: 5 37 35 24 113 88 110 174 121 242 const int n = 200123; const int NMAX = 49 << 12; assert(n <= NMAX); //vector<int> a = r(n), b = r(n); //~ vector<int> a = r(500123), b = r(5817); alignas(64) static int a[NMAX], b[NMAX], r[2 * NMAX]; rand_range(a,a+n); rand_range(b,b+n); mult<NMAX>(a,b,r); //vector<int> r = karatsuba(a, b); //for(int i = 0; i < min(10, (int) r.size()); ++i) cout << r[i] << " "; for(int i = 0; i < min(10, 2 * n - 1); ++i) cout << r[i] << " "; cout << "\n"; }

Comments (16)

Write comment?

Xellos

5 years ago, # |

Don't forget that you also need modulo for most problems where polynomial multiplication is used.

→ Reply

armoking

5 years ago, # ^ |

This is one more reason to use Karastuba mult It works good with "bad" fft modulo

Yeah, that's one of the big reasons why I prefer Karatsuba over FFT in most situations. Just pointing out that it matters a bit for benchmarks and that you can't use this exact code right away because of overflows.

adamant

+43

Can you make a benchmark comparing it with FFT?

dmkozyrev

← Rev. 2 →

+26

Code for testing

Codeforces results

Multiplication of two polinomials $$$a(x)$$$ and $$$b(x)$$$ with degree equal to $$$\text{NMAX}-1$$$.

NMAX =     524288, karatsuba:   926ms, FFT:   531ms
NMAX =     262144, karatsuba:   332ms, FFT:   249ms
NMAX =     131072, karatsuba:   109ms, FFT:   120ms
NMAX =      65536, karatsuba:    34ms, FFT:    55ms
NMAX =      32768, karatsuba:    10ms, FFT:...
=====
Used: 2448 мс, 69500 КБ

bicsi

What is the reasoning behind allocating the temporary arrays inside the function body? Isn't there a problem with big array allocations on the stack making more cache misses (and with possible stack size limitations of other OJs)? What happens to the benchmarks if you preallocate those arrays?

Cache misses have nothing to do with stack usage. It's the order of data accesses that matters. As an extreme example, if you allocate a 1 GB array on the stack, but don't access it at all, then cache hits/misses are exactly the same because the "allocation" is only a change in upper bits in the stack register.

In fact, the fixed nature of this recursion (let's assume sizes $$$2^n$$$, it doesn't hurt) means that this "allocation" inside functions really is just giving purpose to parts of the stack. When we're at depth $$$d$$$, the locations of these arrays are always the same regardless of the branch of recursion, and when we move to an adjacent depth, these arrays are also adjacent. That means it's pretty good cache-wise at high depths, where we have a lot of recursion.

That makes sense, I was more referring to the fact that if you allocate variables both before the array allocation and after it, the ones after will not be colocated with the ones before, so to speak. That might not be a problem at all (I'm not sure, to be honest), but still, some configurations limit the stack size and therefore huge arrays allocated on the stack might be problematic (?)

+10

Yeah, with 8 MB stacks, this can fail. Easy fix: allocate one chunk on the heap and assign just pointers to each recursion level, it'll do the exact same thing. ¯\_(ツ)_/¯

In Karatsuba, cache isn't much of a bottleneck though. It can be if you jump around in memory too much at high depths, but that's not the case here. Just look at those beautiful linear passes. That's where most work is done and the bottleneck will be mostly the number of add/multiply operations and how well they can be pipelined. Even more so when modulo is involved.

madhur4127

More optimizations: 1. Using Comba multiplier instead of naive multiplication (around 1.35 times faster) 2. Pre-allocating 4*N space up front to hold the temporaries instead of stack-based allocation.

frogcatcher

What is Comba multiplier?

Page 4 of the PDF in Optimization in Small.

https://pdfs.semanticscholar.org/8b2b/98385baad2bdab2944f4e5f725ee237ca94f.pdf by Paul G Comba, Exponentiation Cryptosystems on the IBM PC, 1990.

Errichto

← Rev. 3 →

Mine is slower (1s for multiplying vectors of size $$$200\,000$$$) but can handle different sizes of two arrays and it doesn't use vectorization so maybe can be improved nicely too. The complexity is $$$O(max(s1, s2) \cdot min(s1, s2)^{0.6})$$$. It works better than FFT for arrays smaller than maybe $$$30\,000$$$ and/or if sizes differ a lot.

code

#include <bits/stdc++.h>
using namespace std;
#define REP(i, n) for(int i = 0; i < (n); ++i)
// Complexity O(A * B^0.6) where A >= B.
template<typename T> void rec_kara(T* a, int one, T* b, int two, T* r) { // by UW/Errichto
	if(min(one, two) <= 20) { // must be at least "<= 1"
		REP(i, one) REP(j, two) r[i+j] += a[i] * b[j];
		return;
	}
	const int x = min(one, two);
	if(one < two) rec_kara(a, x, b + x, two - x, r + x);
	if(two < one) rec_kara(a + x, one - x, b, x, r + x);
	const int n = (x + 1) / 2, right = x / 2;
	vector<T> tu(2 * n);
	rec_kara(a, n, b, n, tu.data());
	REP(i, 2 * n - 1) {
		r[i] += tu[i];
		r[i+n] -= tu[i];
		tu[i] = 0;
	}
	rec_kara(a + n, right, b + n, right, tu.data());
	REP(i, 2 * right - 1) r[i+n] -= tu[i], r[i+2*n] += tu[i];
	tu[n-1] = a[n-1]; tu[2*n-1] = b[n-1];
	REP(i, right) tu[i] = a[i]+a[i+n], tu[i+n] = b[i]+b[i+n];
	rec_kara(tu.data(), n, tu.data() + n, n, r + n);
}
template<typename T> vector<T> karatsuba(vector<T> a, vector<T> b) {
	if(a.empty() || b.empty()) return {};
	vector<T> r(a.size() + b.size() - 1);
	rec_kara(a.data(), a.size(), b.data(), b.size(), r.data());
	return r;
}
// =======================================================
vector<int> r(int n) {
	vector<int> w;
	for(int i = 0; i < n; ++i) w.push_back(rand() % 10);
	return w;
}
int main() {
	//vector<int> a = {1,2,3};
	//vector<int> b = {5,0,20,500};
	//vector<int> r = karatsuba(a, b);
	vector<int> a = r(200123), b = r(200123);
	//~ vector<int> a = r(500123), b = r(5817);
	vector<int> r = karatsuba(a, b);
	for(int i = 0; i < min(10, (int) r.size()); ++i) cout << r[i] << " ";
	cout << "\n";
}

Thanks for sharing your implementation. On your testcase ($$$200123$$$ random ints from $$$0$$$ to $$$9$$$) implementation from blog with static arrays works in $$$218\text{ ms}$$$

I will think about template for not equal lengths both powers of two.

SPyofgame

Wow, nice <3

MarioYC

22 months ago, # |

-18

How could this code be modified to work in some modulo? I've tried to modify it for this problem about polynomial multiplication, but got TLE (you can find the code for my submission here). In this case using int alone gives WA because of overflow, so I had to use an intermediate casting to long long before applying the modulo.

dmkozyrev's blog

Idea of Karatsuba multiply

Efficient implementation

Testing