有没什么开源的线程安全内存池更快的库？

水木社区手机版

主题:有没什么开源的线程安全内存池更快的库？
楼主|Algoquant|2025-01-09 19:00:58|只看此ID
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <memory>
#include <new>
#include <string>
#include <thread>
#include <utility>
#include <vector>

#include <boost/pool/object_pool.hpp>
#include <boost/pool/singleton_pool.hpp>
#include <spdlog/spdlog.h>

/// Scoped stopwatch: takes a start timestamp on construction and logs the
/// elapsed whole milliseconds via SPDLOG_INFO whenever print() runs. The
/// destructor always calls print(), so every instance logs at least once.
struct TimeCost {

    /// Starts timing immediately.
    /// @param _title    label written into the log line.
    /// @param describe  accepted but not used by this struct.
    /// @param defaultP  accepted but not used by this struct.
    TimeCost(const std::string& _title, const std::string& describe = "", bool defaultP = true) :
        title(_title), start(std::chrono::steady_clock::now()), end(start) {
    }

    /// Memberwise copy. The copy logs on destruction as well, so copying a
    /// TimeCost yields one log line per object.
    TimeCost(const TimeCost& rhs) = default;

    ~TimeCost() {
        print();
    }

    /// Logs "TimeCost:<title>,cost:<n>ms".
    /// @param _text  accepted but not used.
    void print(const std::string& _text = "") {
        // end == start marks "not stopped yet": the first call samples the
        // clock and latches the stop time; later calls report the same value.
        if (end == start) {
            end = std::chrono::steady_clock::now();
        }
        const auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
        SPDLOG_INFO("TimeCost:{},cost:{}ms", title, duration.count());
    }

    std::string title;
    std::chrono::steady_clock::time_point start{ std::chrono::steady_clock::time_point::min() };
    std::chrono::steady_clock::time_point end  { std::chrono::steady_clock::time_point::min() };
};

/// Baseline for the comparison: objects come straight from the global heap
/// through new/delete, with no pooling.
template <typename T>
class Benchmark {
public:

    /// Heap-allocates a T, forwarding the arguments to its constructor.
    /// @return pointer to the new object; release it with destroy().
    template <typename... CtorArgs>
    static T* construct(CtorArgs&&... args) {
        T* const created = new T(std::forward<CtorArgs>(args)...);
        return created;
    }

    /// Deletes an object obtained from construct().
    static void destroy(T* ptr) {
        delete ptr;
    }

};

template <typename T, std::size_t N = 2000>
class ObjectFactory
{

private:
    struct SharedDeleter
    {
        ObjectFactory<T>* m_pFact;

        SharedDeleter(ObjectFactory<T>* fact) : m_pFact(fact) {}

        inline void operator()(T* p) const
        {
            m_pFact->destroy(p);
        }
    };

    static thread_local boost::object_pool<T>* m_tlsPool;
    SharedDeleter           m_Deleter;

public:

    ObjectFactory() : m_Deleter(this)
    {
        SPDLOG_INFO("ObjectFactory init");
    }

    virtual ~ObjectFactory()
    {
        if (m_tlsPool)
            delete m_tlsPool;
    }

    template<typename TType = T, typename... TArgs>
    inline static TType* construct(TArgs&&... mArgs)
    {
        return m_tlsPool->construct(mArgs...);
    }

    inline static void destroy(T* mObj)
    {
        if (!mObj)
            return;
        m_tlsPool->destroy(mObj);
    }

    template<typename TType = T, typename... TArgs>
    inline std::shared_ptr<TType> make_shared(TArgs&&... mArgs)
    {
        return std::shared_ptr<TType>(this->construct(mArgs...), m_Deleter);
    }
};

template<typename T, std::size_t N>
/*static*/thread_local boost::object_pool<T>* ObjectFactory<T, N>::m_tlsPool = new boost::object_pool<T>();

/// Factory backed by a process-wide boost::singleton_pool, which serialises
/// every malloc/free with its mutex. Objects may therefore be constructed on
/// one thread and destroyed on another.
///
/// @tparam T  type of the pooled objects; also used as the pool tag.
/// @tparam N  number of chunks the pool requests from the system the first
///            time it needs memory (singleton_pool's NextSize).
template <typename T, std::size_t N = 2000>
class ObjectFactoryThreadSafe
{
    // singleton_pool<Tag, RequestedSize, UserAllocator, Mutex, NextSize>:
    // RequestedSize is the size of one chunk in BYTES, so it has to be
    // sizeof(T). Passing N there makes every object occupy N bytes and
    // overflows the chunk as soon as sizeof(T) > N.
    using SigPool = boost::singleton_pool<T, sizeof(T),
                                          boost::default_user_allocator_new_delete,
                                          boost::details::pool::default_mutex,
                                          N>;

private:
    /// Stateless shared_ptr deleter that routes back to destroy().
    /// singleton_pool has no destroy() member, so it cannot be called directly.
    struct SharedDeleter
    {
        SharedDeleter() {}
        inline void operator()(T* p) const
        {
            ObjectFactoryThreadSafe::destroy(p);
        }
    };

public:

    ObjectFactoryThreadSafe() = default;
    virtual ~ObjectFactoryThreadSafe() {}

    /// Takes one chunk from the pool and constructs a TType in it.
    /// @param mArgs  forwarded to TType's constructor.
    /// @return the new object, or nullptr if the pool could not obtain memory.
    /// If the constructor throws, the chunk is returned to the pool and the
    /// exception propagates.
    template<typename TType = T, typename... TArgs>
    inline static TType* construct(TArgs&&... mArgs)
    {
        static_assert(sizeof(TType) <= sizeof(T),
                      "TType does not fit into a pool chunk sized for T");

        void* const raw = SigPool::malloc();
        if (!raw)
            return nullptr;   // never placement-new into a null chunk

        try
        {
            return new (raw) TType(std::forward<TArgs>(mArgs)...);
        }
        catch (...)
        {
            SigPool::free(raw);
            throw;
        }
    }

    /// Runs ~T() on mObj and returns its chunk to the pool. Null is ignored.
    inline static void destroy(T* mObj)
    {
        if (!mObj)
            return;
        mObj->~T();
        SigPool::free(mObj);
    }

    /// Like construct(), but returns a shared_ptr whose deleter calls destroy().
    template<typename TType = T, typename... TArgs>
    inline std::shared_ptr<TType> make_shared(TArgs&&... mArgs)
    {
        return std::shared_ptr<TType>(ObjectFactoryThreadSafe::construct(std::forward<TArgs>(mArgs)...), SharedDeleter{});
    }
};

/// Small payload type used as the pooled object in the allocator comparison.
struct MyClass {
    std::string name;
    int id;
    double score{ 0 };
    double age    { 0 };
    double weight { 0 };

    /// @param name  copied into the name member.
    /// @param id    stored in the id member.
    // Initializers are listed in declaration order (name, then id), which is
    // the order in which members are actually initialized.
    MyClass(const std::string& name, int id) : name(name), id(id) {
        //std::cout << "MyClass constructed: " << id << ", " << name << std::endl;
    }
};

/// Times N construct/destroy round trips through ObjectFactory<MyClass>.
/// The elapsed time is logged when `t` goes out of scope.
/// @param N  number of round trips.
void threadlocal(std::size_t N = 100000) {
    TimeCost t("threadlocal");
    using TestPool = ObjectFactory<MyClass>;

    // The index has N's type, so the loop condition is not a signed/unsigned
    // comparison. The field values are computed in double so a large N
    // cannot overflow an int multiplication.
    for (std::size_t i = 0; i < N; ++i) {
        const int id = static_cast<int>(i);
        const double step = static_cast<double>(i);
        auto m = TestPool::construct("threadlocal", id);
        m->weight = step * 100;
        m->age = step * 10;
        m->score = step * 0.1;
        TestPool::destroy(m);
    }
}
/// Times N construct/destroy round trips through Benchmark<MyClass>
/// (plain new/delete). The elapsed time is logged when `t` goes out of scope.
/// @param N  number of round trips.
void benchmark(std::size_t N = 100000) {
    TimeCost t("benchmark");
    using TestPool = Benchmark<MyClass>;

    // The index has N's type, so the loop condition is not a signed/unsigned
    // comparison. The field values are computed in double so a large N
    // cannot overflow an int multiplication.
    for (std::size_t i = 0; i < N; ++i) {
        const int id = static_cast<int>(i);
        const double step = static_cast<double>(i);
        auto m = TestPool::construct("benchmark", id);
        m->weight = step * 100;
        m->age = step * 10;
        m->score = step * 0.1;
        TestPool::destroy(m);
    }
}
/// Times N construct/destroy round trips through
/// ObjectFactoryThreadSafe<MyClass>. The elapsed time is logged when `t`
/// goes out of scope.
/// @param N  number of round trips.
void threadsafe(std::size_t N = 100000) {
    TimeCost t("threadsafe");
    using TestPool = ObjectFactoryThreadSafe<MyClass>;

    // The index has N's type, so the loop condition is not a signed/unsigned
    // comparison. The field values are computed in double so a large N
    // cannot overflow an int multiplication.
    for (std::size_t i = 0; i < N; ++i) {
        const int id = static_cast<int>(i);
        const double step = static_cast<double>(i);
        auto m = TestPool::construct("threadsafe", id);
        m->weight = step * 100;
        m->age = step * 10;
        m->score = step * 0.1;
        TestPool::destroy(m);
    }
}

/// Configures spdlog, starts numThreads worker threads for each of the three
/// allocation strategies, waits for all of them, then blocks on a key press
/// so the console stays open.
int main() {

    spdlog::set_pattern("[%Y-%m-%d %H:%M:%S.%e][%P:%5t][%^%l%$][%s:%#]%v");
    spdlog::flush_on(spdlog::level::info);
    spdlog::flush_every(std::chrono::seconds(5));
    const std::size_t numThreads = 5;
    const std::size_t callTimes = 1000000;
    std::vector<std::thread> threads;
    threads.reserve(3 * numThreads);

    // NOTE(review): all 3 * numThreads workers are started before any is
    // joined, so the three strategies run at the same time and their logged
    // timings are not isolated from one another.
    for (std::size_t i = 0; i < numThreads; ++i) {
        threads.emplace_back([callTimes]() { benchmark(callTimes); });
    }
    for (std::size_t i = 0; i < numThreads; ++i) {
        threads.emplace_back([callTimes]() { threadlocal(callTimes); });
    }
    for (std::size_t i = 0; i < numThreads; ++i) {
        threads.emplace_back([callTimes]() { threadsafe(callTimes); });
    }
    for (auto& t : threads) {
        t.join();
    }

    // Wait for a key press; the character itself is not needed.
    (void)std::getchar();

    return 0;
}
--
FROM 14.154.27.*
1楼|Algoquant|2025-01-09 19:06:09|只看此ID
简单测试了一下：boost::object_pool 配合 thread_local 使用时比 new/delete 快不少，但它不是线程安全的，不能跨线程使用——一旦指针被复制或 post 到其他线程，就废了。boost::singleton_pool 是全局的，可以保证线程安全，但速度慢了很多。上述三个函数（threadlocal / benchmark / threadsafe）的耗时大概是 45 vs 100 vs 400 的级别。
有没人知道或用过哪些更好的实现的

[2025-01-09 19:10:41.099][5264:30972][info][memoryPoolTest.cpp:23]TimeCost:threadlocal,cost:25ms
[2025-01-09 19:10:41.101][5264:38744][info][memoryPoolTest.cpp:23]TimeCost:threadlocal,cost:27ms
[2025-01-09 19:10:41.122][5264:41876][info][memoryPoolTest.cpp:23]TimeCost:threadlocal,cost:49ms
[2025-01-09 19:10:41.131][5264:40668][info][memoryPoolTest.cpp:23]TimeCost:threadlocal,cost:57ms
[2025-01-09 19:10:41.145][5264:43864][info][memoryPoolTest.cpp:23]TimeCost:threadlocal,cost:71ms
[2025-01-09 19:10:41.162][5264:21132][info][memoryPoolTest.cpp:23]TimeCost:benchmark,cost:89ms
[2025-01-09 19:10:41.168][5264:24156][info][memoryPoolTest.cpp:23]TimeCost:benchmark,cost:95ms
[2025-01-09 19:10:41.170][5264:23256][info][memoryPoolTest.cpp:23]TimeCost:benchmark,cost:97ms
[2025-01-09 19:10:41.175][5264:47936][info][memoryPoolTest.cpp:23]TimeCost:benchmark,cost:102ms
[2025-01-09 19:10:41.180][5264:23776][info][memoryPoolTest.cpp:23]TimeCost:benchmark,cost:107ms
[2025-01-09 19:10:41.460][5264:30264][info][memoryPoolTest.cpp:23]TimeCost:threadsafe,cost:386ms
[2025-01-09 19:10:41.471][5264:18472][info][memoryPoolTest.cpp:23]TimeCost:threadsafe,cost:397ms
[2025-01-09 19:10:41.476][5264:46956][info][memoryPoolTest.cpp:23]TimeCost:threadsafe,cost:401ms
[2025-01-09 19:10:41.483][5264:13004][info][memoryPoolTest.cpp:23]TimeCost:threadsafe,cost:409ms
[2025-01-09 19:10:41.489][5264:17116][info][memoryPoolTest.cpp:23]TimeCost:threadsafe,cost:416ms
--
修改:Algoquant FROM 14.154.27.*
FROM 14.154.27.*
2楼|Algoquant|2025-01-09 19:27:04|只看此ID
如果线程numThreads 设为1的话， boost::singleton_pool 也并不慢，也就是说如果线程竞争的概率很低的话 boost::singleton_pool 速度也还可以。

[2025-01-09 19:23:29.098][44384:15440][info][memoryPoolTest.cpp:23]TimeCost:threadlocal,cost:16ms
[2025-01-09 19:23:29.132][44384: 3324][info][memoryPoolTest.cpp:23]TimeCost:threadsafe,cost:49ms
[2025-01-09 19:23:29.137][44384: 6444][info][memoryPoolTest.cpp:23]TimeCost:benchmark,cost:55ms

【在 Algoquant 的大作中提到: 】
: 简单测试了一下， boost::object_pool 在thread_local 时比 new/delete 快不少，但不是线程安全的，不能跨线程，一旦指针被复制post到其他线程，就废了，用 boost::singleton_pool 是全局的可以线程安全，但速度慢了很多，上述三个函数速度大概是 45 vs 100 vs 400的级别。
: 有没人知道或用过哪些更好的实现的
:
: ...................
--
修改:Algoquant FROM 14.154.27.*
FROM 14.154.27.*
3楼|ziqin|2025-01-09 20:06:58|只看此ID
你需要改构架
--
FROM 183.128.165.*
4楼|Algoquant|2025-01-09 21:12:10|只看此ID
啥意思，不要使用内存池？还是说保证 thread_local 的东西不要跨线程处理？
【在 ziqin 的大作中提到: 】
: 你需要改构架
--
FROM 14.154.27.*
5楼|ziqin|2025-01-09 21:36:48|只看此ID
程序的性能和框架的灵活性之间永远有一个平衡点，不可能既要又要。

我的理解的是，你现在的框架是一个任何一个worker thread都可以construct object，异步工作以后，任何一个worker thread都可以destruct object的框架，这个框架非常universal，非常棒，但是对不起，OS本身的new/delete就是为这个框架设计的，除非你能改kernel层面的东西，在user层面，你要这么universal那new/delete就是最优解

除非你在框架灵活上做出让步，不然光谈性能没有意义

【在 Algoquant 的大作中提到: 】
: 啥意思，不要使用内存池？还是说保证 thread_local 的东西不要跨线程处理？
--
FROM 183.128.165.*
6楼|ziqin|2025-01-09 21:53:48|只看此ID
你不需要一个user层面的pool，你需要一个kernel层面更快的new/delete，查一下TCMalloc
--
FROM 183.128.165.*
7楼|Algoquant|2025-01-09 22:36:04|只看此ID
我原先也觉得内核实现的 new/delete 已经是多线程安全的，肯定做到极致了，所以好奇有没有真的更快的实现；而且很多所谓的内存碎片问题，C 标准库的实现好像也已经考虑了。我现在在 github 上搜到并尝试过的线程安全 memory pool，都没跑过 new/delete。

【在 ziqin 的大作中提到: 】
: 程序的性能和框架的灵活性之间永远有一个平衡点，不可能既要又要
: 我的理解的是，你现在的框架是一个任何一个worker thread都可以construct object，异步工作以后，任何一个worker thread都可以destruct object的框架，这个框架非常universal，非常棒，但是对不起，OS本身的new/delete就是为这个框架设计的，除非你能改kernel层面的东西，在user层面，你要这么universal那new/delete就是最优解
: 除非你在框架灵活上做出让步，不然光谈性能没有意义
: ...................
--
FROM 14.154.27.*
8楼|Algoquant|2025-01-09 23:48:37|只看此ID
windows 上用了jemalloc，提升很明显，还是上面的例子，耗时 jemalloc vs new/delete 大概 1:4
【在 ziqin 的大作中提到: 】
: 你不需要一个user层面的pool，你需要一个kernel层面更快的new/delete，查一下TCMalloc
--
FROM 14.154.27.*
9楼|ylh1969|2025-01-12 19:13:17|只看此ID
tcmalloc。好用，线程并发环境使用效果显著。
百度一下，似乎有人反映它存在内存泄漏，并提供了解决方法。
但是我在纯C应用中没有发现此类问题，似乎与C++的异常处理和析构有关。
【在 Algoquant 的大作中提到: 】
: #include <boost/pool/object_pool.hpp>
: #include <boost/pool/singleton_pool.hpp>
: #include <spdlog/spdlog.h>
: ...................
--
修改:ylh1969 FROM 221.218.60.*
FROM 221.218.60.*