linear algebra - Parallel saxpy implementation in Go isn't scaling well across cores -


so i'm trying implement implementation of saxpy both blocked , can computed in parallel using 8-cores available on machine. started assumption small sizes of vectors x , y fit l1 cache of machine (split 256kb - 128kb data, 128kb code), can computed in serial. test assumption, wrote 2 implementations of saxpy, 1 blocked serial version of saxpy (bss) , blocked parallel version of saxpy (bps). blocking algorithm used when sizes of vectors larger 4096 elements long. following implementations:

const cachecap = 32*1024/8 // 4096 func blocked_serial_saxpy(a float64, x []float64, incx int, b float64, y []float64, incy int, z []float64, incz int) {     zn := len(z)     //fmt.println("zn: ", zn)     if zn <= cachecap {         serial_saxpy(a, x, incx, b, y, incy, z, incz)         return     }      nblocks := zn/cachecap + 1     //fmt.println("nblocks: ", nblocks)     := 0; < nblocks; i++ {         beg := * cachecap         end := (i + 1) * cachecap         if end >= zn {             end = zn         }         //fmt.println("beg, end: ", beg, end)         xb := x[beg:end]         yb := y[beg:end]         zb := z[beg:end]         serial_saxpy(a, xb, incx, b, yb, incy, zb, incz)     } } func blocked_parallel_saxpy(a float64, x []float64, incx int, b float64, y []float64, incy int, z []float64, incz int) {     zn := len(z)     if zn <= cachecap {         //fmt.println("zn <= cachecap: using serial_saxpy")         serial_saxpy(a, x, incx, b, y, incy, z, incz)         return     }      nblocks := zn/cachecap + 1     //fmt.println("nblocks: ", nblocks)     nworkers := runtime.gomaxprocs(0)     if nblocks < nworkers {         nworkers = nblocks     }     //fmt.println("nworkers: ", nworkers)      //buf := blocksize*nworkers     //if buf > nblocks {     //  buf = nblocks     //}     //sendchan := make(chan block, buf)     sendchan := make(chan block, nblocks)      var wg sync.waitgroup     := 0; < nworkers; i++ {         wg.add(1)         go func() {             defer wg.done()             a, b := a, b             incx, incy, incz := incx, incy, incz             blk := range sendchan {                 beg, end := blk.beg, blk.end                 serial_saxpy(a, x[beg:end], incx, b, y[beg:end], incy, z[beg:end], incz)             }         }()     }      := 0; < nblocks; i++ {         beg := * cachecap         end := (i + 1) * cachecap         if end >= zn {             end = zn         }         //fmt.println("beg:end", beg, end)         sendchan <- block{beg, end}     }     close(sendchan)     wg.wait() }  func serial_saxpy(a float64, x []float64, incx int, b float64, y []float64, incy int, z []float64, incz int) {     if incx <= 0 || incy <= 0 || incz <= 0 {         panic("axpby: 0 or negative increments not supported")     }      n := len(z) / incz     if incx == 1 && incy == 1 && incz == 1 {         if == 1 && b == 1 {             := 0; < n; i++ {                 z[i] = x[i] + y[i]             }             return         }          if == 0 && b == 1 {             copy(z, y)             //for := 0; < n; i++ {             //  z[i] = y[i]             //}             return         }          if == 1 && b == 0 {             copy(z, x)             //for := 0; < n; i++ {             //  z[i] = x[i]             //}             return         }          if == 0 && b == 0 {             return         }          := 0; < n; i++ {             z[i] = a*x[i] + b*y[i]         }         return     }      // unequal increments or equal increments != 1     ix, iy, iz := 0, 0, 0     if == 1 && b == 1 {         := 0; < n; i, ix, iy, iz = i+1, ix+incx, iy+incy, iz+incz {             z[iz] = x[ix] + y[iy]         }         return     }      if == 0 && b == 1 {         := 0; < n; i, ix, iy, iz = i+1, ix+incx, iy+incy, iz+incz {             z[iz] = y[iy]         }         return     }      if == 1 && b == 0 {         := 0; < n; i, ix, iy, iz = i+1, ix+incx, iy+incy, iz+incz {             z[iz] = x[ix]         }         return     }      if == 0 && b == 0 {         return     }      := 0; < n; i, ix, iy, iz = i+1, ix+incx, iy+incy, iz+incz {         z[iz] = a*x[ix] + b*y[iy]     } } 

i wrote benchmarks 3 functions blocked_serial_saxpy, blocked_parallel_saxpy , serial_saxpy. following image shows results of benchmarks vector sizes 1e3, 1e4, 1e5, 2e5, 3e5, 4e5, 6e5, 8e5 , 1e6 respectively: saxpy benchmarks part 1saxpy benchmarks part 2

to me visualize performance of blocked_parallel_saxpy implementation, plotted results , obtained: saxpy performance plot looking @ plot, makes me wonder, why not seeing parallel speedup, when cpus being used , @ 100% during blocked_parallel_saxpy benchmark. image task manager below: saxpy cpu usage

could me understand what's going on here? i'm seeing, symptom of problem or way should be? if it's former, there way fix it?

edit: have modified blocked_parallel_saxpy code following. dividing total no.of blocks (nblocks) such there nworker goroutines computing nworker no. of blocks, in parallel. in addition, have removed channel. have benchmarked code , performs identically parallel implementation channel, hence why haven't attached benchmarks.

func blocked_parallel_saxpy(a float64, x []float64, incx int, b float64, y []float64, incy int, z []float64, incz int) {     zn := len(z)     if zn <= cachecap {         serial_saxpy(a, x, incx, b, y, incy, z, incz)         return     }      nblocks := zn/cachecap + 1     nworkers := runtime.gomaxprocs(0)     if nblocks < nworkers {         nworkers = nblocks     }      var wg sync.waitgroup     := 0; < nworkers; i++ {         j := 0; j < nworkers && (i+j) < nblocks; j++ {             wg.add(1)             go func(i, j int) {                 defer wg.done()                 a, b := a, b                 incx, incy, incz := incx, incy, incz                 k := + j                 beg := k * cachecap                 end := (k + 1) * cachecap                 if end >= zn {                     end = zn                 }                 serial_saxpy(a, x[beg:end], incx, b, y[beg:end], incy, z[beg:end], incz)             }(i, j)         }     wg.wait() } 

edit.2: have written version of blocked_parallel_saxpy, again, without channels. time, spawn numcpu goroutines, each processing nblocks/nworkers + 1 blocks each block cachecap no. of elements in length. even, here, code performs identically previous 2 implementations.

func blocked_parallel_saxpy(a float64, x []float64, incx int, b float64, y []float64, incy int, z []float64, incz int) {     zn := len(z)     if zn <= cachecap {         serial_saxpy(a, x, incx, b, y, incy, z, incz)         return     }      nblocks := zn/cachecap + 1     nworkers := runtime.gomaxprocs(runtime.numcpu())     if nblocks < nworkers {         nworkers = nblocks     }      k := nblocks/nworkers + 1     var wg sync.waitgroup     wg.add(nworkers)     := 0; < nworkers; i++ {         go func(i int) {             defer wg.done()             j := 0; j < k && (j+i*k) < nblocks; j++ {                 beg := (j + i*k) * cachecap                 end := beg + cachecap                 if end > zn {                     end = zn                 }                 //fmt.printf("i:%d, j:%d, k:%d, [beg:end]=[%d:%d]\n", i, j, k, beg, end)                 serial_saxpy(a, x[beg:end], incx, b, y[beg:end], incy, z[beg:end], incz)             }         }(i)     }      wg.wait() } 

i'd try parallel version without channels, each worker computes every 8th block, without coordination.


Comments