Parallel memcpy in cpp












0















I am trying to copy a matrix in parallel. Below is the code that I am working with. Currently, it works as expected with char, but it seg faults when I use shorts. I assume that the bug is in copying outside of the memory outside of the vector. I have tried to debug my assumption without success.



CMakeLists.txt



cmake_minimum_required(VERSION 3.0)
project(memcpy CXX)
find_package (Threads)
add_executable(memcpy main.cpp)
set_property(TARGET memcpy PROPERTY CXX_STANDARD 17)
target_link_libraries (memcpy ${CMAKE_THREAD_LIBS_INIT})


main.cpp



#include <cassert>
#include <condition_variable>
#include <cstring>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#include <vector>


class Barrier {
public:
explicit Barrier(std::size_t const count) : m_threshold(count), m_remaining(count), m_generation(0) {}

void wait() {
auto local = std::unique_lock<std::mutex>{m_mutex};
auto current_generation = m_generation;

m_remaining--;
if (!m_remaining) {
m_generation++;
m_remaining = m_threshold;
m_condition.notify_all();
} else {
m_condition.wait(local, [this, current_generation] { return current_generation != m_generation; });
}
}

private:
std::mutex m_mutex;
std::condition_variable m_condition;
std::size_t m_threshold;
std::size_t m_remaining;
std::size_t m_generation;
};


template <typename T>
class Matrix {
using reference = typename std::vector<T>::reference;
using const_reference = typename std::vector<T>::const_reference;

public:
Matrix(std::size_t rows, std::size_t cols) : m_rows(rows), m_cols(cols), m_data(m_cols * m_rows) {}
Matrix(std::size_t rows, std::size_t cols, T const& default_val) : m_rows(rows), m_cols(cols), m_data(m_cols * m_rows, default_val) {}

constexpr std::size_t get_columns() const { return m_cols; }
constexpr std::size_t get_rows() const { return m_rows; }
constexpr std::size_t get_element_count() const {
assert(m_cols * m_rows == m_data.size());
return m_cols * m_rows;
}

T* data() { return m_data.data(); }
T const* data() const { return m_data.data(); }

reference operator()(std::size_t const column_x, std::size_t const row_y) {
assert(0 <= column_x);
assert(column_x < get_columns());
assert(0 <= row_y);
assert(row_y < get_rows());

return m_data[row_y * m_cols + column_x];
}

const_reference operator()(std::size_t const column_x, std::size_t const row_y) const {
assert(0 <= column_x);
assert(column_x < get_columns());
assert(0 <= row_y);
assert(row_y < get_rows());

return m_data[row_y * m_cols + column_x];
}

private:
std::size_t const m_rows;
std::size_t const m_cols;
std::vector<T> m_data;
};


static_assert(false, "FIX ME");
using T = char;
// using T = short;
// using T = int;
// using T = double;


void run(std::size_t const my_rank, std::size_t const num_threads, Barrier& barrier, Matrix<T> const& from_data, Matrix<T>& to_data) {
auto n = from_data.get_element_count();
std::string str;

if (my_rank == 0) {
std::cerr << "bytes to copy: " << (n * sizeof(T)) << 'n';
}

// initialization
std::size_t segment_size = n / num_threads;
std::size_t start = (my_rank * segment_size) * sizeof(T);
std::size_t end = ((my_rank + 1) * segment_size) * sizeof(T);
std::size_t distance = end - start;


str += " my_rank: " + std::to_string(my_rank);
str += " segment_size: " + std::to_string(segment_size);
str += " start: " + std::to_string(start);
str += " end: " + std::to_string(end);
str += " distance: " + std::to_string(distance);
str += " rank: " + std::to_string(my_rank);
str += " start: " + std::to_string(start);
str += " end: " + std::to_string(end);
str += " distance: " + std::to_string(distance);
str += " e: " + std::to_string(start + distance);
str += "n";
std::cerr << str;

barrier.wait();
std::memcpy(to_data.data() + start, from_data.data() + start, distance);
barrier.wait();


if (my_rank == 0)
for (auto y = 0; y < from_data.get_rows(); y++) {
for (auto x = 0; x < from_data.get_columns(); x++) {
if (to_data(x, y) != from_data(x, y)) {
std::cerr << "x: " << x << 't' << "y: " << y << "tt";
std::cerr << "to: " << to_data(x, y) << 't' << "from: " << from_data(x, y) << 'n';
}
}
}

barrier.wait();
}


int main() {
auto const num_threads = 1;
// auto const num_threads = 4;

// auto const width = 64;
// auto const height = 64;
auto const width = 97;
auto const height = 101;

auto from_data = Matrix<T>(width, height, 70);
auto to_data = Matrix<T>(width, height, 84);

std::vector<std::thread> threads;
auto barrier = Barrier{num_threads};
for (auto i = 0; i < num_threads; i++) {
threads.emplace_back(run, i, num_threads, std::ref(barrier), std::ref(from_data), std::ref(to_data));
}

for (auto& thread : threads) {
thread.join();
}
}









share|improve this question

























  • I doubt you will get a significant performance improvement from threads if your goal is just to duplicate data in memory... maybe if you use a number of threads equal to the number of memory channels but even so, CPU clocks are already about the double the memory's, which makes it four times faster considering you have to read and then write back, the memory bus is the bottleneck here, not the CPU.

    – Havenard
    Nov 15 '18 at 4:14






  • 1





    auto from_data = Matrix<std::string>(width, height, 70); -- Your code is instantly broken. If you had considered things like this, you would have never used memcpy. Never use std::memcpy if there is a chance that the thing you're copying could be non-POD. Compilers these days are smart enough to choose what type of copy to use when you use std::copy instead (either memcpy, a loop, etc.).

    – PaulMcKenzie
    Nov 15 '18 at 4:32













  • @Havenard The threads already exist at this point in the code and would have nothing to do but wait around for the memcpy to finish. (it's also for a university assignment)

    – Brandon
    Nov 15 '18 at 4:33
















0















I am trying to copy a matrix in parallel. Below is the code that I am working with. Currently, it works as expected with char, but it seg faults when I use shorts. I assume that the bug is in copying outside of the memory outside of the vector. I have tried to debug my assumption without success.



CMakeLists.txt



cmake_minimum_required(VERSION 3.0)
project(memcpy CXX)
find_package (Threads)
add_executable(memcpy main.cpp)
set_property(TARGET memcpy PROPERTY CXX_STANDARD 17)
target_link_libraries (memcpy ${CMAKE_THREAD_LIBS_INIT})


main.cpp



#include <cassert>
#include <condition_variable>
#include <cstring>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#include <vector>


class Barrier {
public:
explicit Barrier(std::size_t const count) : m_threshold(count), m_remaining(count), m_generation(0) {}

void wait() {
auto local = std::unique_lock<std::mutex>{m_mutex};
auto current_generation = m_generation;

m_remaining--;
if (!m_remaining) {
m_generation++;
m_remaining = m_threshold;
m_condition.notify_all();
} else {
m_condition.wait(local, [this, current_generation] { return current_generation != m_generation; });
}
}

private:
std::mutex m_mutex;
std::condition_variable m_condition;
std::size_t m_threshold;
std::size_t m_remaining;
std::size_t m_generation;
};


template <typename T>
class Matrix {
using reference = typename std::vector<T>::reference;
using const_reference = typename std::vector<T>::const_reference;

public:
Matrix(std::size_t rows, std::size_t cols) : m_rows(rows), m_cols(cols), m_data(m_cols * m_rows) {}
Matrix(std::size_t rows, std::size_t cols, T const& default_val) : m_rows(rows), m_cols(cols), m_data(m_cols * m_rows, default_val) {}

constexpr std::size_t get_columns() const { return m_cols; }
constexpr std::size_t get_rows() const { return m_rows; }
constexpr std::size_t get_element_count() const {
assert(m_cols * m_rows == m_data.size());
return m_cols * m_rows;
}

T* data() { return m_data.data(); }
T const* data() const { return m_data.data(); }

reference operator()(std::size_t const column_x, std::size_t const row_y) {
assert(0 <= column_x);
assert(column_x < get_columns());
assert(0 <= row_y);
assert(row_y < get_rows());

return m_data[row_y * m_cols + column_x];
}

const_reference operator()(std::size_t const column_x, std::size_t const row_y) const {
assert(0 <= column_x);
assert(column_x < get_columns());
assert(0 <= row_y);
assert(row_y < get_rows());

return m_data[row_y * m_cols + column_x];
}

private:
std::size_t const m_rows;
std::size_t const m_cols;
std::vector<T> m_data;
};


static_assert(false, "FIX ME");
using T = char;
// using T = short;
// using T = int;
// using T = double;


void run(std::size_t const my_rank, std::size_t const num_threads, Barrier& barrier, Matrix<T> const& from_data, Matrix<T>& to_data) {
auto n = from_data.get_element_count();
std::string str;

if (my_rank == 0) {
std::cerr << "bytes to copy: " << (n * sizeof(T)) << 'n';
}

// initialization
std::size_t segment_size = n / num_threads;
std::size_t start = (my_rank * segment_size) * sizeof(T);
std::size_t end = ((my_rank + 1) * segment_size) * sizeof(T);
std::size_t distance = end - start;


str += " my_rank: " + std::to_string(my_rank);
str += " segment_size: " + std::to_string(segment_size);
str += " start: " + std::to_string(start);
str += " end: " + std::to_string(end);
str += " distance: " + std::to_string(distance);
str += " rank: " + std::to_string(my_rank);
str += " start: " + std::to_string(start);
str += " end: " + std::to_string(end);
str += " distance: " + std::to_string(distance);
str += " e: " + std::to_string(start + distance);
str += "n";
std::cerr << str;

barrier.wait();
std::memcpy(to_data.data() + start, from_data.data() + start, distance);
barrier.wait();


if (my_rank == 0)
for (auto y = 0; y < from_data.get_rows(); y++) {
for (auto x = 0; x < from_data.get_columns(); x++) {
if (to_data(x, y) != from_data(x, y)) {
std::cerr << "x: " << x << 't' << "y: " << y << "tt";
std::cerr << "to: " << to_data(x, y) << 't' << "from: " << from_data(x, y) << 'n';
}
}
}

barrier.wait();
}


int main() {
auto const num_threads = 1;
// auto const num_threads = 4;

// auto const width = 64;
// auto const height = 64;
auto const width = 97;
auto const height = 101;

auto from_data = Matrix<T>(width, height, 70);
auto to_data = Matrix<T>(width, height, 84);

std::vector<std::thread> threads;
auto barrier = Barrier{num_threads};
for (auto i = 0; i < num_threads; i++) {
threads.emplace_back(run, i, num_threads, std::ref(barrier), std::ref(from_data), std::ref(to_data));
}

for (auto& thread : threads) {
thread.join();
}
}









share|improve this question

























  • I doubt you will get a significant performance improvement from threads if your goal is just to duplicate data in memory... maybe if you use a number of threads equal to the number of memory channels but even so, CPU clocks are already about the double the memory's, which makes it four times faster considering you have to read and then write back, the memory bus is the bottleneck here, not the CPU.

    – Havenard
    Nov 15 '18 at 4:14






  • 1





    auto from_data = Matrix<std::string>(width, height, 70); -- Your code is instantly broken. If you had considered things like this, you would have never used memcpy. Never use std::memcpy if there is a chance that the thing you're copying could be non-POD. Compilers these days are smart enough to choose what type of copy to use when you use std::copy instead (either memcpy, a loop, etc.).

    – PaulMcKenzie
    Nov 15 '18 at 4:32













  • @Havenard The threads already exist at this point in the code and would have nothing to do but wait around for the memcpy to finish. (it's also for a university assignment)

    – Brandon
    Nov 15 '18 at 4:33














0












0








0








I am trying to copy a matrix in parallel. Below is the code that I am working with. Currently, it works as expected with char, but it seg faults when I use shorts. I assume that the bug is in copying outside of the memory outside of the vector. I have tried to debug my assumption without success.



CMakeLists.txt



cmake_minimum_required(VERSION 3.0)
project(memcpy CXX)
find_package (Threads)
add_executable(memcpy main.cpp)
set_property(TARGET memcpy PROPERTY CXX_STANDARD 17)
target_link_libraries (memcpy ${CMAKE_THREAD_LIBS_INIT})


main.cpp



#include <cassert>
#include <condition_variable>
#include <cstring>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#include <vector>


class Barrier {
public:
explicit Barrier(std::size_t const count) : m_threshold(count), m_remaining(count), m_generation(0) {}

void wait() {
auto local = std::unique_lock<std::mutex>{m_mutex};
auto current_generation = m_generation;

m_remaining--;
if (!m_remaining) {
m_generation++;
m_remaining = m_threshold;
m_condition.notify_all();
} else {
m_condition.wait(local, [this, current_generation] { return current_generation != m_generation; });
}
}

private:
std::mutex m_mutex;
std::condition_variable m_condition;
std::size_t m_threshold;
std::size_t m_remaining;
std::size_t m_generation;
};


template <typename T>
class Matrix {
using reference = typename std::vector<T>::reference;
using const_reference = typename std::vector<T>::const_reference;

public:
Matrix(std::size_t rows, std::size_t cols) : m_rows(rows), m_cols(cols), m_data(m_cols * m_rows) {}
Matrix(std::size_t rows, std::size_t cols, T const& default_val) : m_rows(rows), m_cols(cols), m_data(m_cols * m_rows, default_val) {}

constexpr std::size_t get_columns() const { return m_cols; }
constexpr std::size_t get_rows() const { return m_rows; }
constexpr std::size_t get_element_count() const {
assert(m_cols * m_rows == m_data.size());
return m_cols * m_rows;
}

T* data() { return m_data.data(); }
T const* data() const { return m_data.data(); }

reference operator()(std::size_t const column_x, std::size_t const row_y) {
assert(0 <= column_x);
assert(column_x < get_columns());
assert(0 <= row_y);
assert(row_y < get_rows());

return m_data[row_y * m_cols + column_x];
}

const_reference operator()(std::size_t const column_x, std::size_t const row_y) const {
assert(0 <= column_x);
assert(column_x < get_columns());
assert(0 <= row_y);
assert(row_y < get_rows());

return m_data[row_y * m_cols + column_x];
}

private:
std::size_t const m_rows;
std::size_t const m_cols;
std::vector<T> m_data;
};


static_assert(false, "FIX ME");
using T = char;
// using T = short;
// using T = int;
// using T = double;


void run(std::size_t const my_rank, std::size_t const num_threads, Barrier& barrier, Matrix<T> const& from_data, Matrix<T>& to_data) {
auto n = from_data.get_element_count();
std::string str;

if (my_rank == 0) {
std::cerr << "bytes to copy: " << (n * sizeof(T)) << 'n';
}

// initialization
std::size_t segment_size = n / num_threads;
std::size_t start = (my_rank * segment_size) * sizeof(T);
std::size_t end = ((my_rank + 1) * segment_size) * sizeof(T);
std::size_t distance = end - start;


str += " my_rank: " + std::to_string(my_rank);
str += " segment_size: " + std::to_string(segment_size);
str += " start: " + std::to_string(start);
str += " end: " + std::to_string(end);
str += " distance: " + std::to_string(distance);
str += " rank: " + std::to_string(my_rank);
str += " start: " + std::to_string(start);
str += " end: " + std::to_string(end);
str += " distance: " + std::to_string(distance);
str += " e: " + std::to_string(start + distance);
str += "n";
std::cerr << str;

barrier.wait();
std::memcpy(to_data.data() + start, from_data.data() + start, distance);
barrier.wait();


if (my_rank == 0)
for (auto y = 0; y < from_data.get_rows(); y++) {
for (auto x = 0; x < from_data.get_columns(); x++) {
if (to_data(x, y) != from_data(x, y)) {
std::cerr << "x: " << x << 't' << "y: " << y << "tt";
std::cerr << "to: " << to_data(x, y) << 't' << "from: " << from_data(x, y) << 'n';
}
}
}

barrier.wait();
}


int main() {
auto const num_threads = 1;
// auto const num_threads = 4;

// auto const width = 64;
// auto const height = 64;
auto const width = 97;
auto const height = 101;

auto from_data = Matrix<T>(width, height, 70);
auto to_data = Matrix<T>(width, height, 84);

std::vector<std::thread> threads;
auto barrier = Barrier{num_threads};
for (auto i = 0; i < num_threads; i++) {
threads.emplace_back(run, i, num_threads, std::ref(barrier), std::ref(from_data), std::ref(to_data));
}

for (auto& thread : threads) {
thread.join();
}
}









share|improve this question
















I am trying to copy a matrix in parallel. Below is the code that I am working with. Currently, it works as expected with char, but it seg faults when I use shorts. I assume that the bug is in copying outside of the memory outside of the vector. I have tried to debug my assumption without success.



CMakeLists.txt



cmake_minimum_required(VERSION 3.0)
project(memcpy CXX)
find_package (Threads)
add_executable(memcpy main.cpp)
set_property(TARGET memcpy PROPERTY CXX_STANDARD 17)
target_link_libraries (memcpy ${CMAKE_THREAD_LIBS_INIT})


main.cpp



#include <cassert>
#include <condition_variable>
#include <cstring>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#include <vector>


class Barrier {
public:
explicit Barrier(std::size_t const count) : m_threshold(count), m_remaining(count), m_generation(0) {}

void wait() {
auto local = std::unique_lock<std::mutex>{m_mutex};
auto current_generation = m_generation;

m_remaining--;
if (!m_remaining) {
m_generation++;
m_remaining = m_threshold;
m_condition.notify_all();
} else {
m_condition.wait(local, [this, current_generation] { return current_generation != m_generation; });
}
}

private:
std::mutex m_mutex;
std::condition_variable m_condition;
std::size_t m_threshold;
std::size_t m_remaining;
std::size_t m_generation;
};


template <typename T>
class Matrix {
using reference = typename std::vector<T>::reference;
using const_reference = typename std::vector<T>::const_reference;

public:
Matrix(std::size_t rows, std::size_t cols) : m_rows(rows), m_cols(cols), m_data(m_cols * m_rows) {}
Matrix(std::size_t rows, std::size_t cols, T const& default_val) : m_rows(rows), m_cols(cols), m_data(m_cols * m_rows, default_val) {}

constexpr std::size_t get_columns() const { return m_cols; }
constexpr std::size_t get_rows() const { return m_rows; }
constexpr std::size_t get_element_count() const {
assert(m_cols * m_rows == m_data.size());
return m_cols * m_rows;
}

T* data() { return m_data.data(); }
T const* data() const { return m_data.data(); }

reference operator()(std::size_t const column_x, std::size_t const row_y) {
assert(0 <= column_x);
assert(column_x < get_columns());
assert(0 <= row_y);
assert(row_y < get_rows());

return m_data[row_y * m_cols + column_x];
}

const_reference operator()(std::size_t const column_x, std::size_t const row_y) const {
assert(0 <= column_x);
assert(column_x < get_columns());
assert(0 <= row_y);
assert(row_y < get_rows());

return m_data[row_y * m_cols + column_x];
}

private:
std::size_t const m_rows;
std::size_t const m_cols;
std::vector<T> m_data;
};


static_assert(false, "FIX ME");
using T = char;
// using T = short;
// using T = int;
// using T = double;


void run(std::size_t const my_rank, std::size_t const num_threads, Barrier& barrier, Matrix<T> const& from_data, Matrix<T>& to_data) {
auto n = from_data.get_element_count();
std::string str;

if (my_rank == 0) {
std::cerr << "bytes to copy: " << (n * sizeof(T)) << 'n';
}

// initialization
std::size_t segment_size = n / num_threads;
std::size_t start = (my_rank * segment_size) * sizeof(T);
std::size_t end = ((my_rank + 1) * segment_size) * sizeof(T);
std::size_t distance = end - start;


str += " my_rank: " + std::to_string(my_rank);
str += " segment_size: " + std::to_string(segment_size);
str += " start: " + std::to_string(start);
str += " end: " + std::to_string(end);
str += " distance: " + std::to_string(distance);
str += " rank: " + std::to_string(my_rank);
str += " start: " + std::to_string(start);
str += " end: " + std::to_string(end);
str += " distance: " + std::to_string(distance);
str += " e: " + std::to_string(start + distance);
str += "n";
std::cerr << str;

barrier.wait();
std::memcpy(to_data.data() + start, from_data.data() + start, distance);
barrier.wait();


if (my_rank == 0)
for (auto y = 0; y < from_data.get_rows(); y++) {
for (auto x = 0; x < from_data.get_columns(); x++) {
if (to_data(x, y) != from_data(x, y)) {
std::cerr << "x: " << x << 't' << "y: " << y << "tt";
std::cerr << "to: " << to_data(x, y) << 't' << "from: " << from_data(x, y) << 'n';
}
}
}

barrier.wait();
}


int main() {
auto const num_threads = 1;
// auto const num_threads = 4;

// auto const width = 64;
// auto const height = 64;
auto const width = 97;
auto const height = 101;

auto from_data = Matrix<T>(width, height, 70);
auto to_data = Matrix<T>(width, height, 84);

std::vector<std::thread> threads;
auto barrier = Barrier{num_threads};
for (auto i = 0; i < num_threads; i++) {
threads.emplace_back(run, i, num_threads, std::ref(barrier), std::ref(from_data), std::ref(to_data));
}

for (auto& thread : threads) {
thread.join();
}
}






c++ multithreading pointers c++14 memcpy






share|improve this question















share|improve this question













share|improve this question




share|improve this question








edited Nov 15 '18 at 11:59







Brandon

















asked Nov 15 '18 at 3:53









BrandonBrandon

1931312




1931312













  • I doubt you will get a significant performance improvement from threads if your goal is just to duplicate data in memory... maybe if you use a number of threads equal to the number of memory channels but even so, CPU clocks are already about the double the memory's, which makes it four times faster considering you have to read and then write back, the memory bus is the bottleneck here, not the CPU.

    – Havenard
    Nov 15 '18 at 4:14






  • 1





    auto from_data = Matrix<std::string>(width, height, 70); -- Your code is instantly broken. If you had considered things like this, you would have never used memcpy. Never use std::memcpy if there is a chance that the thing you're copying could be non-POD. Compilers these days are smart enough to choose what type of copy to use when you use std::copy instead (either memcpy, a loop, etc.).

    – PaulMcKenzie
    Nov 15 '18 at 4:32













  • @Havenard The threads already exist at this point in the code and would have nothing to do but wait around for the memcpy to finish. (it's also for a university assignment)

    – Brandon
    Nov 15 '18 at 4:33



















  • I doubt you will get a significant performance improvement from threads if your goal is just to duplicate data in memory... maybe if you use a number of threads equal to the number of memory channels but even so, CPU clocks are already about the double the memory's, which makes it four times faster considering you have to read and then write back, the memory bus is the bottleneck here, not the CPU.

    – Havenard
    Nov 15 '18 at 4:14






  • 1





    auto from_data = Matrix<std::string>(width, height, 70); -- Your code is instantly broken. If you had considered things like this, you would have never used memcpy. Never use std::memcpy if there is a chance that the thing you're copying could be non-POD. Compilers these days are smart enough to choose what type of copy to use when you use std::copy instead (either memcpy, a loop, etc.).

    – PaulMcKenzie
    Nov 15 '18 at 4:32













  • @Havenard The threads already exist at this point in the code and would have nothing to do but wait around for the memcpy to finish. (it's also for a university assignment)

    – Brandon
    Nov 15 '18 at 4:33

















I doubt you will get a significant performance improvement from threads if your goal is just to duplicate data in memory... maybe if you use a number of threads equal to the number of memory channels but even so, CPU clocks are already about the double the memory's, which makes it four times faster considering you have to read and then write back, the memory bus is the bottleneck here, not the CPU.

– Havenard
Nov 15 '18 at 4:14





I doubt you will get a significant performance improvement from threads if your goal is just to duplicate data in memory... maybe if you use a number of threads equal to the number of memory channels but even so, CPU clocks are already about the double the memory's, which makes it four times faster considering you have to read and then write back, the memory bus is the bottleneck here, not the CPU.

– Havenard
Nov 15 '18 at 4:14




1




1





auto from_data = Matrix<std::string>(width, height, 70); -- Your code is instantly broken. If you had considered things like this, you would have never used memcpy. Never use std::memcpy if there is a chance that the thing you're copying could be non-POD. Compilers these days are smart enough to choose what type of copy to use when you use std::copy instead (either memcpy, a loop, etc.).

– PaulMcKenzie
Nov 15 '18 at 4:32







auto from_data = Matrix<std::string>(width, height, 70); -- Your code is instantly broken. If you had considered things like this, you would have never used memcpy. Never use std::memcpy if there is a chance that the thing you're copying could be non-POD. Compilers these days are smart enough to choose what type of copy to use when you use std::copy instead (either memcpy, a loop, etc.).

– PaulMcKenzie
Nov 15 '18 at 4:32















@Havenard The threads already exist at this point in the code and would have nothing to do but wait around for the memcpy to finish. (it's also for a university assignment)

– Brandon
Nov 15 '18 at 4:33





@Havenard The threads already exist at this point in the code and would have nothing to do but wait around for the memcpy to finish. (it's also for a university assignment)

– Brandon
Nov 15 '18 at 4:33












1 Answer
1






active

oldest

votes


















4















std::memcpy(to_data.data() + start, from_data.data() + start, distance)




std::vector<T>::data() returns a T* so if you add an integral value foo to it, you effectively add foo * sizeof T bytes ... but you allready multiplied with sizeof(T) earlier when calculating start and end. Also, std::memcpy() won't work for Ts that are not PODs.



Better use std::copy().






share|improve this answer



















  • 2





    Also, std::copy is written to be smart enough to fall back to std::memcpy or equivalent if the type that is detected is trivially copyable. So you're not losing anything by using std::copy.

    – PaulMcKenzie
    Nov 15 '18 at 4:40











Your Answer






StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});


}
});














draft saved

draft discarded


















StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53312199%2fparallel-memcpy-in-cpp%23new-answer', 'question_page');
}
);

Post as a guest















Required, but never shown

























1 Answer
1






active

oldest

votes








1 Answer
1






active

oldest

votes









active

oldest

votes






active

oldest

votes









4















std::memcpy(to_data.data() + start, from_data.data() + start, distance)




std::vector<T>::data() returns a T* so if you add an integral value foo to it, you effectively add foo * sizeof T bytes ... but you allready multiplied with sizeof(T) earlier when calculating start and end. Also, std::memcpy() won't work for Ts that are not PODs.



Better use std::copy().






share|improve this answer



















  • 2





    Also, std::copy is written to be smart enough to fall back to std::memcpy or equivalent if the type that is detected is trivially copyable. So you're not losing anything by using std::copy.

    – PaulMcKenzie
    Nov 15 '18 at 4:40
















4















std::memcpy(to_data.data() + start, from_data.data() + start, distance)




std::vector<T>::data() returns a T* so if you add an integral value foo to it, you effectively add foo * sizeof T bytes ... but you allready multiplied with sizeof(T) earlier when calculating start and end. Also, std::memcpy() won't work for Ts that are not PODs.



Better use std::copy().






share|improve this answer



















  • 2





    Also, std::copy is written to be smart enough to fall back to std::memcpy or equivalent if the type that is detected is trivially copyable. So you're not losing anything by using std::copy.

    – PaulMcKenzie
    Nov 15 '18 at 4:40














4












4








4








std::memcpy(to_data.data() + start, from_data.data() + start, distance)




std::vector<T>::data() returns a T* so if you add an integral value foo to it, you effectively add foo * sizeof T bytes ... but you allready multiplied with sizeof(T) earlier when calculating start and end. Also, std::memcpy() won't work for Ts that are not PODs.



Better use std::copy().






share|improve this answer














std::memcpy(to_data.data() + start, from_data.data() + start, distance)




std::vector<T>::data() returns a T* so if you add an integral value foo to it, you effectively add foo * sizeof T bytes ... but you allready multiplied with sizeof(T) earlier when calculating start and end. Also, std::memcpy() won't work for Ts that are not PODs.



Better use std::copy().







share|improve this answer












share|improve this answer



share|improve this answer










answered Nov 15 '18 at 4:06









SwordfishSwordfish

9,88611436




9,88611436








  • 2





    Also, std::copy is written to be smart enough to fall back to std::memcpy or equivalent if the type that is detected is trivially copyable. So you're not losing anything by using std::copy.

    – PaulMcKenzie
    Nov 15 '18 at 4:40














  • 2





    Also, std::copy is written to be smart enough to fall back to std::memcpy or equivalent if the type that is detected is trivially copyable. So you're not losing anything by using std::copy.

    – PaulMcKenzie
    Nov 15 '18 at 4:40








2




2





Also, std::copy is written to be smart enough to fall back to std::memcpy or equivalent if the type that is detected is trivially copyable. So you're not losing anything by using std::copy.

– PaulMcKenzie
Nov 15 '18 at 4:40





Also, std::copy is written to be smart enough to fall back to std::memcpy or equivalent if the type that is detected is trivially copyable. So you're not losing anything by using std::copy.

– PaulMcKenzie
Nov 15 '18 at 4:40




















draft saved

draft discarded




















































Thanks for contributing an answer to Stack Overflow!


  • Please be sure to answer the question. Provide details and share your research!

But avoid



  • Asking for help, clarification, or responding to other answers.

  • Making statements based on opinion; back them up with references or personal experience.


To learn more, see our tips on writing great answers.




draft saved


draft discarded














StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53312199%2fparallel-memcpy-in-cpp%23new-answer', 'question_page');
}
);

Post as a guest















Required, but never shown





















































Required, but never shown














Required, but never shown












Required, but never shown







Required, but never shown

































Required, but never shown














Required, but never shown












Required, but never shown







Required, but never shown







Popular posts from this blog

Xamarin.iOS Cant Deploy on Iphone

Glorious Revolution

Dulmage-Mendelsohn matrix decomposition in Python