std::deque unusually high memory usage

#	User	Rating
1	tourist	3856
2	jiangly	3747
3	orzdevinwang	3706
4	jqdai0815	3682
5	ksun48	3591
6	gamegame	3477
7	Benq	3468
8	Radewoosh	3462
9	ecnerwala	3451
10	heuristica	3431

#	User	Contrib.
1	cry	167
2	-is-this-fft-	162
3	Dominater069	160
4	Um_nik	158
5	atcoder_official	157
6	Qingyu	156
7	adamant	151
7	djm03178	151
7	luogu_official	151
10	awoo	146

101911E - Painting the Fence

When I first got MLE, I suspected it might be because of #define int long long, but even after changing that I still got MLE. I couldn't figure out what was wrong as I was sure that the space was O(n).

When I used std::set instead of std::deque it used only 100MB memory.

std::deque code

...

#include <bits/stdc++.h>
#include <ext/pb_ds/assoc_container.hpp>
#include <ext/pb_ds/tree_policy.hpp>

// #define int long long
#define float double
#define all(x) (x).begin(), (x).end()
#define pnl cout << "\n"
#ifndef ONLINE_JUDGE
#define dbg(x) cerr << (#x) << " " << (x) << "\n"
#else
#define dbg(x) 42
#endif

using namespace std;
using namespace __gnu_pbds;

template <typename T>
using ordered_set = tree<T, null_type, less<T>, rb_tree_tag, tree_order_statistics_node_update>;
using vi = vector<int>;
using vvi = vector<vi>;
using ii = array<int, 2>;
using vii = vector<ii>;

template <typename T>
istream &operator>>(istream &in, vector<T> &a) {
    for (auto &x : a) in >> x;
    return in;
}
template <typename T>
ostream &operator<<(ostream &out, vector<T> &a) {
    for (auto &x : a) out << x << " ";
    return out;
}

mt19937_64 rng(chrono::steady_clock::now().time_since_epoch().count());

const int inf = numeric_limits<int>::max();
struct Node {
    Node *l = 0, *r = 0;
    int lo, hi, mset = inf, madd = 0, val = 0;
    Node(int lo,int hi):lo(lo),hi(hi){} // Large interval of 0
    Node(vector<int>& v, int lo, int hi) : lo(lo), hi(hi) {
        if (lo + 1 < hi) {
            int mid = lo + (hi - lo) / 2;
            l = new Node(v, lo, mid); r = new Node(v, mid, hi);
            val = l->val + r->val;
        }
        else val = v[lo];
    }
    int query(int L, int R) {
        if (R <= lo || hi <= L) return 0;
        if (L <= lo && hi <= R) return val;
        push();
        return l->query(L, R) + r->query(L, R);
    }
    void set(int L, int R, int x) {
        if (R <= lo || hi <= L) return;
        if (L <= lo && hi <= R) {
            mset = x;
            madd = 0;
            val = x * (hi - lo);
        }
        else {
            push();
            l->set(L, R, x);
            r->set(L, R, x);
            val = l->val + r->val;
        }
    }
    void add(int L, int R, int x) {
        if (R <= lo || hi <= L) return;
        if (L <= lo && hi <= R) {
            if (mset != inf) mset += x;
            else madd += x;
            val += x * (hi - lo);
        }
        else {
            push();
            l->add(L, R, x);
            r->add(L, R, x);
            val = l->val + r->val;
        }
    }
    void push() {
        if (!l) {
            int mid = lo + (hi - lo) / 2;
            l = new Node(lo, mid); r = new Node(mid, hi);
        }
        if (mset != inf) {
            l->set(lo, hi, mset);
            r->set(lo, hi, mset);
            mset = inf;
        }
        else if (madd) {
            l->add(lo, hi, madd);
            r->add(lo, hi, madd);
            madd = 0;
        }
    }
};

void solve(int testcase) {
    int n; cin >> n;
    vi a(n); cin >> a;

    map<int, deque<int>> pos;
    for (int i = 0; i < n; i++) {
        pos[a[i]].push_back(i);
    }

    vi op(n);
    Node st(a, 0, n);
    Node st2(op, 0, n);

    int m; cin >> m;
    while (m--) {
        int clr; cin >> clr;
        if (pos.find(clr) == pos.end()) continue;

        auto &dq = pos[clr];
        if (dq.size() < 2) continue;

        while (dq.size()) {
            int l = dq.front();
            if (st2.query(l, l + 1)) {
                dq.pop_front();
                continue;
            }
            break;
        }
        while (dq.size()) {
            int r = dq.back();
            if (st2.query(r, r + 1)) {
                dq.pop_back();
                continue;
            }
            break;
        }
        if (dq.size() >= 2) {
            int l = dq.front();
            int r = dq.back();
            st.set(l, r + 1, a[l]);
            if (r > l + 1) st2.add(l + 1, r, 1);
        }
    }

    for (int i = 0; i < n; i++) {
        cout << st.query(i, i + 1) << " ";
    }
    pnl;
}

int32_t main() {
    using namespace chrono;
    ios::sync_with_stdio(false); cin.tie(nullptr); cout.tie(nullptr);
    auto start = high_resolution_clock::now();
    int T = 1; // cin >> T;
    for (int t = 1; t <= T; t++) {
        solve(t);
    }
    auto end = high_resolution_clock::now();
    auto duration = duration_cast<milliseconds>(end - start);
#ifndef ONLINE_JUDGE
    cerr << "Time: " << duration.count() << " ms\n";
#endif
    return 0;
}

std::set code

...

#include <bits/stdc++.h>
#include <ext/pb_ds/assoc_container.hpp>
#include <ext/pb_ds/tree_policy.hpp>

// #define int long long
#define float double
#define all(x) (x).begin(), (x).end()
#define pnl cout << "\n"
#ifndef ONLINE_JUDGE
#define dbg(x) cerr << (#x) << " " << (x) << "\n"
#else
#define dbg(x) 42
#endif

using namespace std;
using namespace __gnu_pbds;

template <typename T>
using ordered_set = tree<T, null_type, less<T>, rb_tree_tag, tree_order_statistics_node_update>;
using vi = vector<int>;
using vvi = vector<vi>;
using ii = array<int, 2>;
using vii = vector<ii>;

template <typename T>
istream &operator>>(istream &in, vector<T> &a) {
    for (auto &x : a) in >> x;
    return in;
}
template <typename T>
ostream &operator<<(ostream &out, vector<T> &a) {
    for (auto &x : a) out << x << " ";
    return out;
}

mt19937_64 rng(chrono::steady_clock::now().time_since_epoch().count());

const int inf = numeric_limits<int>::max();
struct Node {
    Node *l = 0, *r = 0;
    int lo, hi, mset = inf, madd = 0, val = 0;
    Node(int lo,int hi):lo(lo),hi(hi){} // Large interval of 0
    Node(vector<int>& v, int lo, int hi) : lo(lo), hi(hi) {
        if (lo + 1 < hi) {
            int mid = lo + (hi - lo) / 2;
            l = new Node(v, lo, mid); r = new Node(v, mid, hi);
            val = l->val + r->val;
        }
        else val = v[lo];
    }
    int query(int L, int R) {
        if (R <= lo || hi <= L) return 0;
        if (L <= lo && hi <= R) return val;
        push();
        return l->query(L, R) + r->query(L, R);
    }
    void set(int L, int R, int x) {
        if (R <= lo || hi <= L) return;
        if (L <= lo && hi <= R) {
            mset = x;
            madd = 0;
            val = x * (hi - lo);
        }
        else {
            push();
            l->set(L, R, x);
            r->set(L, R, x);
            val = l->val + r->val;
        }
    }
    void add(int L, int R, int x) {
        if (R <= lo || hi <= L) return;
        if (L <= lo && hi <= R) {
            if (mset != inf) mset += x;
            else madd += x;
            val += x * (hi - lo);
        }
        else {
            push();
            l->add(L, R, x);
            r->add(L, R, x);
            val = l->val + r->val;
        }
    }
    void push() {
        if (!l) {
            int mid = lo + (hi - lo) / 2;
            l = new Node(lo, mid); r = new Node(mid, hi);
        }
        if (mset != inf) {
            l->set(lo, hi, mset);
            r->set(lo, hi, mset);
            mset = inf;
        }
        else if (madd) {
            l->add(lo, hi, madd);
            r->add(lo, hi, madd);
            madd = 0;
        }
    }
};

void solve(int testcase) {
    int n; cin >> n;
    vi a(n); cin >> a;

    map<int, set<int>> pos;
    for (int i = 0; i < n; i++) {
        pos[a[i]].insert(i);
    }

    vi op(n);
    Node st(a, 0, n);
    Node st2(op, 0, n);

    int m; cin >> m;
    while (m--) {
        int clr; cin >> clr;
        if (pos.find(clr) == pos.end()) continue;

        auto &dq = pos[clr];
        if (dq.size() < 2) {
            pos.erase(clr);
            continue;
        }

        while (dq.size()) {
            int l = *dq.begin();
            if (st2.query(l, l + 1)) {
                dq.erase(dq.begin());
                continue;
            }
            break;
        }
        while (dq.size()) {
            int r = *prev(dq.end());
            if (st2.query(r, r + 1)) {
                dq.erase(prev(dq.end()));
                continue;
            }
            break;
        }
        if (dq.size() >= 2) {
            int l = *dq.begin();
            int r = *prev(dq.end());
            st.set(l, r + 1, a[l]);
            if (r > l + 1) st2.add(l + 1, r, 1);
        } else {
            pos.erase(clr);
        }
    }

    for (int i = 0; i < n; i++) {
        cout << st.query(i, i + 1) << " ";
    }
    pnl;
}

int32_t main() {
    using namespace chrono;
    ios::sync_with_stdio(false); cin.tie(nullptr); cout.tie(nullptr);
    auto start = high_resolution_clock::now();
    int T = 1; // cin >> T;
    for (int t = 1; t <= T; t++) {
        solve(t);
    }
    auto end = high_resolution_clock::now();
    auto duration = duration_cast<milliseconds>(end - start);
#ifndef ONLINE_JUDGE
    cerr << "Time: " << duration.count() << " ms\n";
#endif
    return 0;
}

Can anybody explain why std::deque has such high memory usage?

Comments (13)

Write comment?

Aarush

4 months ago, # |

Auto comment: topic has been updated by Aarush (previous revision, new revision, compare).

→ Reply

negative-xp

deque isn't stored contiguously in memory

retard.

4 months ago, # ^ |

This doesn't affect memory consumption though, only cache utilization.

bIeah

+19

Because its retarded. Deque needs like 2 kilos on its own to prepare for a lot of the shenanigans it can maintain (i.e. random access in constant time). All its subderivates also have the same thing (i.e. queue and stack). So try your best not to use them if you have a lot of them instantiated in parallel.

sparsetable

As a fun fact, you can force stacks to use std::vector by initializing them as stack<int,vector<int>>, but I don't know how this affects performance.

pajenegod

+37

Because its retarded. Deque needs like 2 kilos on its own to prepare for a lot of the shenanigans it can maintain (i.e. random access in constant time)

Random access in constant time is not the reason for why deque is memory hungry...

Deque internally allocates data in constant sized blocks. Depending on which implementation of the standard library you use, the block size can vary a lot.

According to https://devblogs.microsoft.com/oldnewthing/20230810-00/?p=108587, libstdc++ (gcc) uses 512 bytes per block, libc++ (clang) uses 4096 bytes per block, and msvc uses 16 bytes. (I don't know why msvc went with just 16 bytes. It completely kills the performance of its deque.)

Unlike a vector, deque simply allocates a new block everytime it fills up. This makes it so any pointer/reference to an element in a deque is never invalidated on push_back (this is the main feature of deque!). This feature can be very handy if you are working with pointers. Note also that if you want to store a lot of elements, a deque can be significantly more memory efficient compared to a vector since a deque doesn't have to over allocate by as much.

shiven

+28

I'm sure a quick Google search would get you to quite a few instances of people crying about deque's memory usage, and some SE answers trying to explain why, so I won't attempt to do that.

As a generally helpful replacement for deque, std::list exists. We tend to largely ignore Linked lists in CP, but the STL implementation is fairly good at basic stuff like popping or inserting at both sides. This fortunately covers almost all uses of deque in most problems, and with less funky behaviour in my experience. It doesn't support O(1) random access though, and it doesn't look like you need it either. Your current code ACs with <100MB using lists.

Less useful: For this specific submission, you can also AC by submitting in C++17, or deleting all deques of size 1 from the map.

Also, your Node has memory leaks. You don't delete l and r. Doesn't fix the issue here though.

It's from kactl, I didn't bother.

-is-this-fft-

+13

Implementing a deque yourself using a circular buffer probably wouldn't be that hard either.

Rough idea: maintain a buffer of length $$$2^k$$$ as well as a pointer to the first element and the length of the deque. Accessing the $$$i$$$-th element is then done as buf[(i + first) % 1 << k]; pop_back is length--, pop_front is first++; first %= 1 << k;. To push_back and push_front, you can normally write the data to the correct location, but if the length of the deque would exceed $$$2^k$$$, migrate to a buffer of length $$$2^{k + 1}$$$ before (like vectors do).

Mindeveloped

← Rev. 2 →

+45

Reminds me of ...

Defining $$$10^6$$$ std::deque in 512MB ML would definitely lead to MLE.

A famous sentence in Chinese programming community.

+81

can you share some other famous sentences in Chinese programming community?

dsost

About the queue-optimized Bellman-Ford algorithm (known as "SPFA" in Chinese programming community):It's dead.

sleepntsheep

+14

Please share more Chinese wisdom

Aarush's blog