Wednesday, November 15, 2006

PPE-SPE Synchronization

Cell provides mailboxes in order to nicely synchronize the SPEs and the PPE. However, there is something that should be taken into account by programmers who care about performance. When an SPE uses mailboxes to communicate with the PPE, the latter performs a DMA transaction for each tentative read of the mailbox. The end result is that a lot of bus traffic is generated, which might adversely impact performance. Another approach that, while less user friendly, ensures maximum performance is to rely on spinlocks and DMA transfers. The basic idea is to have the PPE and the SPE coordinate by writing to some agreed areas of the main memory and local store respectively. The code below shows how you might achieve this. In fact, the sample code allows you to measure the performance of this spinlock synchronization, which you can easily compare with mailbox-based synchronization.

---------- PPE Code ------------
#include <stdio.h>
#include <stdlib.h>

#include <libspe.h>
#include <cbe_mfc.h>

// -- Common Include --
#include "cbench/common.h"

/* Program handle defined outside this file (presumably the embedded SPE
 * program image -- confirm against the build). */
extern spe_program_handle_t cbench_spinlock_spu;

/* PPE-side flag: main() busy-waits until it becomes non-zero, then resets it
 * to 0. Its address is handed to the SPE thread as argp. */
volatile unsigned long long spinlock     __attribute__((aligned(128)));
/* Copy of the last non-zero value read from `spinlock` (per the comment in
 * main(), the local-store address of the SPE-side spinlock). */
unsigned long long          spinlock_spu __attribute__((aligned(128)));

int main(int argc, char* argv[])
  speid_t speid;
  int     status;
  int     tagid = 1;

  unsigned int run;

  char TEST_ID[32] ="SPINLOCK:PPU>";
  if (argc < 2) {
      printf("USAGE:\n\tspinlock_spu <sync-num>\n");
      return 1;

  run = atoi(argv[1]);
  if (run == 0) {
      run = 1;

  printf("%s PPU Spinlock at [0x%p]\n", TEST_ID, &spinlock);
  spinlock = 0;

  speid = spe_create_thread( 0,
                 (unsigned long long*)&spinlock,
                 (unsigned long long*)run,
                 0 );
  if(speid == 0){
    perror( "Unable to create SPE thread\n");
    return -1;
  unsigned int i;

  for (i = 0; i < run; ++i) {
      while (spinlock == 0) { }
      spinlock_spu = spinlock;
      spinlock = 0;

      /* Now the spinlock contains the LS address for the SPU spinlock */

  spe_wait(speid, &status, 0);
  printf("%s DONE!\n", TEST_ID);
  return 0;

--------- SPE Code --------------
#include <libsim.h>
#include <sim_printf.h>
#include <spu_intrinsics.h>
#include <spu_mfcio.h>
#include <cbe_mfc.h>
#include <profile.h>

/* SPE-side flag living in local store: main() busy-waits until it becomes
 * non-zero and then resets it to 0. */
volatile unsigned long long spinlock     __attribute__ ((aligned (128)));

/*
 * Holds the local-store address of `spinlock`.
 *
 * This has to be an unsigned long long, just because we need to write
 * this much on the destination spinlock.
 */
unsigned long long spinlock_spu_ls __attribute__ ((aligned (128)));
/* Address of the PPU-side spinlock, taken from the low word of argp. */
unsigned int       spinlock_ppu_ea __attribute__ ((aligned (128)));

/*
 * SPU side of the spinlock handshake benchmark.
 *
 * argp carries the address of the PPU spinlock and envp the number of
 * handshakes (both in their low 32-bit word). For each handshake the SPU
 * DMAs the local-store address of its own spinlock into the PPU spinlock,
 * then busy-waits until its own spinlock is set to non-zero, and re-arms it.
 *
 * Always returns 0.
 */
int main(unsigned long long spuid,
     addr64 argp,
     addr64 envp)
{
  const char TEST_ID[32] = "SPINLOCK:SPU>";
  int tag_id = 0;
  unsigned int run = envp.ui[1];
  unsigned int i;

  (void)spuid;

  spinlock_ppu_ea = argp.ui[1];
  spinlock_spu_ls = (unsigned int)&spinlock;

  spinlock = 0;

  sim_printf("%s TEST STARTED\n", TEST_ID);
  sim_printf("%s Performing %u Spinlock measurements\n", TEST_ID, run);
  sim_printf("%s PPU Spinlock at  [0x%x]\n", TEST_ID, spinlock_ppu_ea);
  sim_printf("%s SPU Spinlock at  [0x%llx]\n", TEST_ID, spinlock_spu_ls);

  for (i = 0; i < run; ++i) {
      /*
       * Write the SPU Spinlock EA into the PPU spinlock EA.
       *
       * NOTE(review): only the effective-address argument of this call
       * survived in the published listing; the remaining arguments are
       * reconstructed from the mfc_put(ls, ea, size, tag, tid, rid)
       * signature -- confirm. No tag-status wait is issued: the source
       * buffer is never modified after initialisation, and the loop below
       * cannot exit before the PPU has observed the transferred value.
       */
      mfc_put(&spinlock_spu_ls,
              (unsigned int)spinlock_ppu_ea,
              sizeof(spinlock_spu_ls),
              tag_id, 0, 0);

      /*
       * Wait for the PPU to set the spinlock.
       */
      while (spinlock == 0) { }

      /* i is unsigned, so it is printed with %u. */
      sim_printf("======================[%u]=========================\n", i);
      spinlock = 0;
  }

  sim_printf("%s TEST COMPLETED\n", TEST_ID);

  return 0;
}

No comments: