
1 Hybrid MPI+OpenMP Yao-Yuan Chuang

2 SMP Cluster (Hybrid System)
Most modern high performance computing (HPC) systems are clusters of SMP nodes:
  Distributed Memory Parallelization (DMP) on the node interconnect
  Symmetric Multi-Processing (SMP) inside each node

3 Current Solutions for Programming
MPI based:
  MPP model (massively parallel processing): each CPU = one MPI process
  Hybrid MPI + OpenMP: each SMP node = one MPI process
    MPI communication on the node interconnect
    OpenMP inside each SMP node
    DMP with MPI and SMP with OpenMP
OpenMP based:
  Cluster OpenMP
Other models:
  High Performance Fortran

4 MPI + OpenMP on SMP Clusters
Could provide the highest performance?
Advantages
  Can be effective: heavyweight communication between nodes, lightweight threads within a node
  Fewer and larger communication packets than pure MPI on SMP clusters
Disadvantages
  Very difficult to start with OpenMP and then add MPI
  Very difficult to program, debug, modify, and maintain
  Generally, MPI calls cannot be made inside OpenMP parallel regions (see the sketch below)
  Only people very experienced in both models should use this mixed programming model
  Single-node and single-CPU performance suffers
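A minimal hybrid skeleton (an illustrative sketch, not from the original slides) showing the usual way this restriction is handled: request MPI_THREAD_FUNNELED from MPI_Init_thread so that only the thread that initialized MPI makes MPI calls, and keep communication outside the OpenMP parallel regions.

/* Sketch: hybrid MPI+OpenMP skeleton with funneled threading support.   */
/* Only the master thread calls MPI; OpenMP threads do pure computation. */
#include <mpi.h>
#include <omp.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    int provided, rank;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
    if (provided < MPI_THREAD_FUNNELED) {
        fprintf(stderr, "MPI library does not support MPI_THREAD_FUNNELED\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    #pragma omp parallel
    {
        /* computation only -- no MPI calls inside the parallel region */
        printf("rank %d, thread %d computing\n", rank, omp_get_thread_num());
    }

    /* communication happens here, outside the parallel region */
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Finalize();
    return 0;
}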

5 Hybrid MPI + OpenMP programming
Each MPI process spawns multiple OpenMP threads.
(diagram: mpirun launches MPI processes rank 0 and rank 1; each rank spawns its own team of OpenMP threads)

6 omphello.c
#include <stdio.h>
#include <omp.h>

int main(int argc, char *argv[])
{
    int iam = 0, np = 1;
    #pragma omp parallel default(shared) private(iam, np)
    {
#if defined (_OPENMP)
        np = omp_get_num_threads();
        iam = omp_get_thread_num();
#endif
        printf("Hello from thread %d out of %d\n", iam, np);
    }
    return 0;
}

7 omphello.f
      program ompf
! hello from OMP threads
      implicit none
      integer nthreads, myrank, omp_get_num_threads
      integer omp_get_thread_num
!$OMP PARALLEL PRIVATE(myrank) SHARED(nthreads)
      nthreads = omp_get_num_threads()
      myrank = omp_get_thread_num()
      if (myrank.eq.0) print *, 'Num threads = ', nthreads
      print *, ' HELLO ....I am thread # ', myrank
!$OMP END PARALLEL
      end

8 mpihello.c
#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    int numprocs, rank, namelen;
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Get_processor_name(processor_name, &namelen);
    printf("Process %d on %s out of %d\n", rank, processor_name, numprocs);
    MPI_Finalize();
}

9 mpihello.f
      program f1
! Hello World MPI/F90 style
      implicit none
      include 'mpif.h'
      integer :: myrank, numprocs, namelen, ierr, comm = MPI_COMM_WORLD
      character(80) :: str
      call mpi_init(ierr)
      call mpi_comm_size(comm, numprocs, ierr)
      call mpi_comm_rank(comm, myrank, ierr)
      if (myrank .EQ. 0) print *, 'Num procs ', numprocs
      call mpi_get_processor_name(str, namelen, ierr)
      print *, 'Hello world : I am processor ', myrank, ':', str
      call mpi_finalize(ierr)
      end

10 Hybrid MPI + OpenMP: mixhello.c
#include <stdio.h>
#include "mpi.h"
#include <omp.h>

int main(int argc, char *argv[])
{
    int numprocs, rank, namelen;
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    int iam = 0, np = 1;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Get_processor_name(processor_name, &namelen);

    #pragma omp parallel default(shared) private(iam, np)
    {
        np = omp_get_num_threads();
        iam = omp_get_thread_num();
        printf("Hello from thread %d out of %d from process %d out of %d on %s\n",
               iam, np, rank, numprocs, processor_name);
    }
    MPI_Finalize();
    return 0;
}

11 Compile and Execution
Compile OpenMP hello
  % pgCC omphello.c -o omphello -mp
  % setenv OMP_NUM_THREADS 4
  % ./omphello
Compile MPI hello
  % pgcc mpihello.c -o mpihello -Mmpi
  % /opt/pgi/linux86/6.2/mpi/mpich/bin/mpirun -np 4 mpihello
Compile hybrid MPI + OpenMP hello
  % pgcc mixhello.c -o mixhello -mp -Mmpi
  % /opt/pgi/linux86/6.2/mpi/mpich/bin/mpirun -np 4 mixhello
How about if we forget the -Mmpi?

12 Hybrid MPI + OpenMP: Calculate π
Each MPI process integrates over a range of width 1/nproc, as a discrete sum of nbin bins, each of width step.
Within each MPI process, nthreads OpenMP threads perform part of the sum, just as in the OpenMP-only version (a sketch of the decomposition follows).
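As a sketch of the arithmetic (consistent with the codes on the following slides, where num_steps = nbin * nproc is the total number of bins):

  pi = integral from 0 to 1 of 4/(1+x^2) dx
     ≈ sum over i = 0 .. nbin*nproc-1 of step * 4/(1 + x_i^2),  with x_i = (i+0.5)*step and step = 1/(nbin*nproc)

MPI process p (0 <= p < nproc) sums the bins i = nbin*p .. nbin*(p+1)-1; within each process the OpenMP threads split those nbin bins among themselves, and MPI_Reduce (or MPI_Allreduce) adds up the per-process partial sums.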

13 omppi.c
#include <omp.h>
static long num_steps = ... ;   /* value elided in the original slide */
double step;
#define NUM_THREADS 2

void main ()
{
    int i;
    double x, pi, sum = 0.0;
    step = 1.0/(double) num_steps;
    omp_set_num_threads(NUM_THREADS);
    #pragma omp parallel for reduction(+:sum) private(x)
    for (i = 1; i <= num_steps; i++) {
        x = (i-0.5)*step;
        sum = sum + 4.0/(1.0+x*x);
    }
    pi = step * sum;
}

14 mpipi.c
#include <mpi.h>
static long num_steps = ... ;   /* declaration and value not shown on the original slide */

void main (int argc, char *argv[])
{
    int i, my_id, numprocs;
    long my_steps;
    double x, pi, step, sum = 0.0;
    step = 1.0/(double) num_steps;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_id);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    my_steps = num_steps/numprocs;
    for (i = my_id*my_steps; i < (my_id+1)*my_steps; i++) {
        x = (i+0.5)*step;
        sum += 4.0/(1.0+x*x);
    }
    sum *= step;
    MPI_Reduce(&sum, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Finalize();
}

15 mixpi.c
#include <mpi.h>
#include "omp.h"
static long num_steps = ... ;   /* declaration and value not shown on the original slide */

void main (int argc, char *argv[])
{
    int i, my_id, numprocs;
    long my_steps;
    double x, pi, step, sum = 0.0;
    step = 1.0/(double) num_steps;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_id);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    my_steps = num_steps/numprocs;
    #pragma omp parallel for private(x) reduction(+:sum)
    for (i = my_id*my_steps; i < (my_id+1)*my_steps; i++) {
        x = (i+0.5)*step;
        sum += 4.0/(1.0+x*x);
    }
    sum *= step;
    MPI_Reduce(&sum, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Finalize();
}
Get the MPI part done first, then add the OpenMP pragma.

16 Hybrid MPI + OpenMP: Calculate π
#include <stdio.h>
#include <mpi.h>
#include <omp.h>
#define NBIN ...          /* value elided in the original slide */
#define MAX_THREADS 8

void main(int argc, char **argv)
{
    int nbin, myid, nproc, nthreads, tid;
    double step, sum[MAX_THREADS] = {0.0}, pi = 0.0, pig;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    nbin = NBIN/nproc;
    step = 1.0/(nbin*nproc);
    #pragma omp parallel private(tid)
    {
        int i;
        double x;
        nthreads = omp_get_num_threads();
        tid = omp_get_thread_num();
        for (i = nbin*myid+tid; i < nbin*(myid+1); i += nthreads) {
            x = (i+0.5)*step;
            sum[tid] += 4.0/(1.0+x*x);
        }
        printf("rank tid sum = %d %d %e\n", myid, tid, sum[tid]);
    }
    for (tid = 0; tid < nthreads; tid++) pi += sum[tid]*step;
    MPI_Allreduce(&pi, &pig, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    if (myid == 0) printf("PI = %f\n", pig);
    MPI_Finalize();
}

17 Hybrid MPI + OpenMP: Calculate π
Compilation
  % mpicc -o hpi hpi.c -openmp
Execution
  % mpirun -np 2 hpi
Output (one "rank tid sum" line per rank/thread pair, followed by the final value; the numeric values were lost in this transcript)
  rank tid sum = ...
  rank tid sum = ...
  rank tid sum = ...
  rank tid sum = ...
  PI = ...

18 Comparison From “MPI versus MPI+OpenMP on the IBM SP for the NAS Benchmarks” by F. Cappello and D. Etiemble (2000) The hybrid memory model of clusters of multiprocessors raises two issues: programming model and performance. Many parallel programs have been written by using the MPI standard. To evaluate the pertinence of hybrid models for existing MPI codes, we compare a unified model (MPI) and a hybrid one (OpenMP fine grain parallelization after profiling) for the NAS 2.3 benchmarks on two IBM SP systems. The superiority of one model depends on 1) the level of shared memory model parallelization, 2) the communication patterns and 3) the memory access patterns. The relative speeds of the main architecture components (CPU, memory, and network) are of tremendous importance for selecting one model. With the used hybrid model, our results show that a unified MPI approach is better for most of the benchmarks. The hybrid approach becomes better only when fast processors make the communication performance significant and the level of parallelization is sufficient.

19 Comparison From “Comparing the OpenMP, MPI, and Hybrid Programming Paradigms on an SMP Cluster” by G. Jost and H. Jin, NASA Ames Research Center (2003). We have run several implementations of the same CFD benchmark code employing different parallelization paradigms on a cluster of SMP nodes. When using the high-speed interconnect or shared memory, the pure MPI paradigm turned out to be the most efficient. A slow network led to a decrease in the performance of the pure MPI implementation. The hybrid implementations showed different sensitivity to network speed, depending on the parallelization strategy employed. The benefit of the hybrid implementations was visible on a slow network. The hybrid parallelization approach is suitable for large applications with an inherent multilevel structure, such as multi-zone codes. For codes like the BT benchmark, where parallelization occurs only on one or more of the spatial dimensions, the use of either process level parallelization or OpenMP is, in general, more appropriate. We plan to conduct a study using multi-zone versions of the NAS Parallel Benchmarks [12], which are more suitable for the exploitation of multilevel parallelism.

20 Cluster OpenMP Yao-Yuan Chuang

21 What is OpenMP
OpenMP is a language for annotating sequential programs in C, C++, and Fortran:
  Creation of thread teams
  Redundant execution
  Cooperative execution
  Synchronization between threads: barriers, locks, critical sections, single-threaded regions
  Creation of variables private to a thread
Implemented through compiler support and a run-time library.

22 Cluster OpenMP
Cluster OpenMP is a run-time library that supports running an OpenMP program on a cluster.
It is released with the Intel compilers (starting from 9.1) and requires an extra license.
Suitable programs:
  Programs that scale successfully with OpenMP
  Programs that have good data locality
  Programs that use synchronization sparingly

23 Why Cluster OpenMP?
You need higher performance than can be achieved on a single node.
You want a cluster programming model that is easier than MPI.
Your program gets excellent speedup with ordinary OpenMP.
Your program has reasonably good locality of reference and little synchronization.

24 A Simple Cluster OpenMP Program
#include <stdio.h>
#include <omp.h>

static int x;
#pragma intel omp sharable(x)

int main()
{
    x = 0;
    #pragma omp parallel
    {
        #pragma omp critical
        x++;
    }
    printf("%d should equal %d\n", omp_get_max_threads(), x);
    return 0;
}

25 Compiling and Running the Program
Compilation
  % icc test.c -o test -cluster-openmp
  % icc test.c -o test -cluster-openmp-profile
Assume there are two nodes, node254 and node1, and we run one process per node with 4 threads each:
  % cat kmp_cluster.ini
  --hostlist=node254,node1 --processes=2 --process_threads=4
The Cluster OpenMP runtime reads kmp_cluster.ini and determines the parameters: hostnames, number of processes, and number of threads per process.
(The printed omp_get_max_threads() value should be 8 for the previous example: 2 processes x 4 threads.)

26 Cluster OpenMP Process Model
(diagram: 8 OpenMP threads, numbered 0-7, spread over 4 Linux processes running on 2 nodes)
Node - a physical machine
Process - a Linux process
Thread - a POSIX thread within a process, implementing an OpenMP thread

27 Cluster OpenMP Memory Model
If multiple OpenMP threads access a variable, it must be sharable!
(diagram: each process address space contains the Cluster OpenMP sharable memory holding the sharable variables, plus process-sharable memory and private memory)
Note:
  Process-sharable memory can be shared between threads in a process.
  Private memory is accessible by a single thread.
  Cluster OpenMP does not provide a single system image.

28 How Cluster OpenMP Program Works
A consistency protocol is designed to manage memory in Cluster OpenMP. The basic idea:
  Between OpenMP barriers, data exchange is not necessary, i.e., modifications become visible to other threads only after synchronization.
  When a page of sharable memory is not up-to-date, it becomes protected.
  Any access then faults (SIGSEGV) into the Cluster OpenMP runtime library, which requests information from remote nodes and updates the page.
  Protection is removed from the page.
  The instruction causing the fault is restarted, this time successfully accessing the data.
(An illustrative sketch of this page-protection mechanism follows.)
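The sketch below is purely illustrative and is not the Cluster OpenMP runtime: it only demonstrates, under the assumption of a Linux/POSIX environment, the general mprotect/SIGSEGV mechanism the slide describes. A page is protected to mark it out-of-date, touching it faults into a handler (where a real DSM runtime would fetch diffs from remote nodes), the handler unprotects the page, and the faulting instruction restarts.

/* Illustrative only: page-fault-driven update of an "out-of-date" page. */
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static char *page;                /* one page standing in for sharable memory */
static long  pagesize;

static void segv_handler(int sig, siginfo_t *si, void *ctx)
{
    (void)sig; (void)si; (void)ctx;
    /* A real DSM runtime would request and apply remote diffs here. */
    mprotect(page, pagesize, PROT_READ | PROT_WRITE);   /* remove protection */
    /* returning from the handler restarts the faulting instruction */
}

int main(void)
{
    pagesize = sysconf(_SC_PAGESIZE);
    page = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    struct sigaction sa;
    memset(&sa, 0, sizeof sa);
    sa.sa_flags = SA_SIGINFO;
    sa.sa_sigaction = segv_handler;
    sigaction(SIGSEGV, &sa, NULL);

    mprotect(page, pagesize, PROT_NONE);   /* mark the page out-of-date */
    page[0] = 42;                          /* faults, handler updates the page */
    printf("page[0] = %d\n", page[0]);
    return 0;
}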

29 Consistency Protocol
(diagram: pages A, B, and C replicated on nodes 0, 1, and 2. Between two OpenMP barriers the threads write A[1], A[2], B[1], B[2], and C[1] on different nodes. At each barrier, write notices such as WriteNotice(0A,2A,2B,0C), naming node/page pairs, are received and propagated by the master thread. After the barrier, a read of A[1] on a node whose copy is stale causes a page fault; diffs for page A are calculated and applied, and the read of A[1] is re-executed.)

30 MPI Based Parallelization vs. DSM
MPI based (hybrid MPI + OpenMP):
  Potential to exchange the boundary between two domains in one large message, dominated by the bandwidth of the network.
DSM based (OpenMP only):
  Additional latency-based overhead in each barrier.
  Communication of the updated data of whole pages:
    Not all of this data may be needed.
    Packets may be too small, so latency is significant.
    Communication is not oriented on the boundaries of a domain decomposition, so probably more data is transferred than necessary.
  Communication might be 10 times slower than with MPI.

31 Sharable Variables
OpenMP: all variables are sharable except threadprivate variables.
Cluster OpenMP: sharable variables are variables that either
  are used in a shared way in a parallel region and allocated in an enclosing scope in the same routine, or
  appear in a sharable directive.
The compiler automatically applies these assumptions when -cluster-openmp or -cluster-openmp-profile is specified: it automatically makes the indicated variables sharable. All other variables are non-sharable by default.

32 Sharable Variables
All variables in a shared clause or implicitly shared must be made sharable, except for system variables.
A sharable variable can be specified by:
  #pragma intel omp sharable(var)   // C, C++
  !dir$ omp sharable(var)           ! Fortran
A short example follows.
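A minimal C illustration (not from the original slides; the variable name and the work done are made up) of the C form of the directive, for a file-scope variable that every thread updates:

/* Hypothetical example: 'total' is written by all OpenMP threads, so under
   Cluster OpenMP it must live in sharable memory. */
#include <stdio.h>
#include <omp.h>

static double total;                 /* shared across OpenMP threads  */
#pragma intel omp sharable(total)    /* place it in sharable memory   */

int main(void)
{
    total = 0.0;
    #pragma omp parallel
    {
        #pragma omp critical
        total += 1.0;                /* shared write -> needs sharable */
    }
    printf("total = %f\n", total);
    return 0;
}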

33 Before Porting
Verify that your code works correctly with OpenMP.
Try running with the -cluster-openmp option.
Use the -clomp-sharable-propagation option to identify the sharable variables:
  Compile all source files using -clomp-sharable-propagation and -ipo.
  Insert the indicated sharable directives in your code.
Rebuild and execute the program.

34 Example: pi.f and pi2.f
pi.f:
      double precision pi
      integer nsteps
      nsteps = ...                     ! value not legible in this transcript
      call compute (nsteps, pi)
      print *, nsteps, pi
      end

pi2.f:
      subroutine calcpi(nsteps, pi, sum)
      double precision pi, sum, step
      double precision x
      step = 1.0d0/nsteps
      sum = 0.0d0
!$omp parallel private(x)
!$omp do reduction (+:sum)
      do i = 1, nsteps
         x = (i - 0.5d0)*step
         sum = sum + 4.0d0/(1.0d0 + x*x)
      end do
!$omp end do
!$omp end parallel
      pi = step * sum
      end

      subroutine compute(nsteps, pi)
      double precision pi, sum
      call calcpi(nsteps, pi, sum)
      end

35 Example: pi.f and pi2.f
To find the variables that must be declared sharable, use the following command:
  % ifort -cluster-openmp -clomp-sharable-propagation pi.f pi2.f -ipo
The resulting compiler warnings:
  IPO: performing multi-file optimizations
  IPO: generating object file /tmp/ipo-ifortqKrZN4.o
  fortcom: Warning: Sharable directive should be inserted by user as '!dir$ omp sharable(nsteps)' in file pi.f, line 2, column 16
  fortcom: Warning: Sharable directive should be inserted by user as '!dir$ omp sharable(sum)' in file pi2.f, line 2, column 29
  pi.f(18) : (col. 6) remark: OpenMP DEFINED LOOP WAS PARALLELIZED.
  pi2.f(17) : (col. 6) remark: OpenMP DEFINED REGION WAS PARALLELIZED.

36 Example: pi.f and pi2.f
pi.f:
      double precision pi
      integer nsteps
!dir$ omp sharable(nsteps)
      nsteps = ...                     ! value not legible in this transcript
      call compute (nsteps, pi)
      print *, nsteps, pi
      end

pi2.f:
      subroutine calcpi(nsteps, pi, sum)
      double precision pi, sum, step
      double precision x
      step = 1.0d0/nsteps
      sum = 0.0d0
!$omp parallel private(x)
!$omp do reduction (+:sum)
      do i = 1, nsteps
         x = (i - 0.5d0)*step
         sum = sum + 4.0d0/(1.0d0 + x*x)
      end do
!$omp end do
!$omp end parallel
      pi = step * sum
      end

      subroutine compute(nsteps, pi)
      double precision pi, sum
!dir$ omp sharable(sum)
      call calcpi(nsteps, pi, sum)
      end

37 Using the Disjoint Heap
If a heap block is misused, the program will issue a SIGSEGV immediately, rather than continuing to execute with a wrong value.
(diagram: (a) normal heap: processes p0, p1, and p2 share one heap address range that expands as needed; (b) disjoint heap: each process's heap occupies its own distinct address range, so a stray access lands in an invalid address range and faults)

38 Disjoint Heap
Enable the disjoint heap by setting the environment variable KMP_DISJOINT_HEAPSIZE to a size, with 'K' for 1024 bytes and 'M' for 1024*1024 bytes; the minimum is 2 MB.
  Compile with "-g"
  % setenv KMP_DISJOINT_HEAPSIZE 128M
  Run the program; if you get an error (e.g., segmentation fault ip=0x400c2f), use addr2line to find the source line:
  % addr2line -e switch.exe 0x400c2f
  Switch the allocation of the variable on that line to kmp_sharable_malloc.
The total address space consumed by the disjoint heap is the size you set for KMP_DISJOINT_HEAPSIZE multiplied by the number of threads. If any process in your program uses more heap space than is allocated for the disjoint heap, an error message appears.

39 Language Specific Porting Steps
For each language, it is important to check for the shared use of dynamically allocated memory.
Fortran code:
  Use -clomp-sharable-commons, -clomp-sharable-localsaves, and -clomp-sharable-argexprs to isolate the offending variables.
  ALLOCATABLE variables can be made sharable by naming them in a sharable directive.
C and C++ code:
  Use kmp_sharable_malloc instead of malloc and kmp_sharable_free instead of free (a short sketch follows).
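A minimal C sketch of the malloc-to-kmp_sharable_malloc substitution (illustrative only; the array, its size, and the explicit prototypes are assumptions made for the example) for heap data that is dereferenced inside a parallel region:

/* Hypothetical example: an array read and written by OpenMP threads must be
   allocated from the sharable heap under Cluster OpenMP. */
#include <stdlib.h>
#include <omp.h>

/* prototypes written out here for the sketch; the Intel compiler
   environment provides the real declarations */
void *kmp_sharable_malloc(size_t size);
void  kmp_sharable_free(void *ptr);

int main(void)
{
    int n = 1000;
    /* before: double *a = malloc(n * sizeof(double));       */
    double *a = kmp_sharable_malloc(n * sizeof(double));

    #pragma omp parallel for
    for (int i = 0; i < n; i++)
        a[i] = 2.0 * i;              /* shared access through the pointer */

    /* before: free(a); */
    kmp_sharable_free(a);
    return 0;
}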

40 Use the default(none) clause to find sharable variables
If your program does not function correctly, use the default(none) clause to find variables that need to be made sharable:
  Place a default(none) clause on a parallel directive that seems to reference a non-sharable variable in a shared way.
  Add the variables mentioned in the resulting messages to a private or shared clause for the parallel region and recompile.
  Use the -clomp-sharable-info option to report all variables automatically promoted to sharable.
  Verify that all variables in the shared clause appear either in a -clomp-sharable-info message or in an explicit sharable directive.
  For C/C++ programs, verify that data shared by dereferencing a pointer is made sharable.
A short illustration is given below.
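A short illustration (variable names made up for the example) of how default(none) forces every variable in the region to be listed explicitly, which surfaces the ones that still need a sharable directive:

/* With default(none), the compiler rejects any variable that is not listed
   in a shared/private/firstprivate clause, so every shared variable is
   visible for inspection. */
#include <omp.h>

static double result;
#pragma intel omp sharable(result)   /* listed in shared() below, so it must
                                        also be placed in sharable memory  */

void accumulate(int n)
{
    #pragma omp parallel for default(none) shared(result) firstprivate(n)
    for (int i = 0; i < n; i++) {
        #pragma omp atomic
        result += 1.0 / (i + 1);     /* shared update */
    }
}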

41 Fortran Consideration
In Fortran, the sharable directive must be placed in the declaration part of a routine.
A common block cannot appear in the sharable directive variable list, but the common block name can:
  !dir$ omp sharable (/cname/)
Variables in an EQUIVALENCE statement cannot appear in the sharable directive list.
Options:
  -clomp-sharable-argexprs    Arguments to any subroutine or function call are assigned to temporary values located in sharable memory.
  -clomp-sharable-commons     All common blocks are placed in sharable memory.
  -clomp-sharable-localsaves  All variables declared in subroutines or functions with the SAVE attribute are placed in sharable memory.
  -clomp-sharable-modvars     All variables declared in modules are placed in sharable memory.
Default values: -no-clomp-sharable-argexprs -no-clomp-sharable-commons -no-clomp-sharable-localsaves -no-clomp-sharable-modvars

42 Fortran Consideration
Original code: common /blk/ a(100)
  Option: -clomp-sharable-commons
  Or change to:
      common /blk/ a(100)
      !dir$ omp sharable (/blk/)

Original code: real a(100) / save a
  Option: -clomp-sharable-localsaves
  Or change to:
      real a(100)
      save a
      !dir$ omp sharable (a)

Original code: module m / real a(100)
  Option: -clomp-sharable-modvars
  Or change to:
      module m
      real a(100)
      !dir$ omp sharable (a)

43 Running a Cluster OpenMP Program
Verify that a kmp_cluster.ini file exists in the current directory.
Run the configuration checker script:
  % clomp_configchecker.pl program_name
  Checks the kmp_cluster.ini file
  Pings each node and tests the rsh/ssh command
  Confirms the existence of the executable on each node
  Verifies library compatibility
  Prints warning messages if there is any inconsistency
  Creates a log file, clomp_configchecker.log
Type the name of the executable to execute the program.

44 Cluster OpenMP Initialization File
The kmp_cluster.ini file consists of an option line, an environment variable section, and comments.
The PATH, SHELL, and LD_LIBRARY_PATH environment variables are not allowed.
Options:
  processes=integer (default: OMP_NUM_THREADS/process_threads if OMP_NUM_THREADS is set, otherwise the number of hosts) - number of processes to use
  process_threads=integer (default: 1) - number of threads per process
  omp_num_threads=integer (default: processes * process_threads) - number of OpenMP threads
  hostlist=host1,host2,... (default: master node) - list of hosts
  hostfile=filename - list of hosts given in a file
  launch=keyword (default: rsh) - rsh or ssh
  sharable_heap=integer[K/M/G] (default: 256M) - size of sharable memory
  startup_timeout=integer (default: 30) - number of seconds to wait for a remote node to start
  [no-]heartbeat (default: heartbeat) - check that remote nodes are alive
An example file is shown below.
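As an illustration (the hostnames are the ones from slide 25; the launch and sharable_heap settings and the environment variable line are assumptions added for the example, not taken from the slides), a kmp_cluster.ini combining several of these options might look like:

--hostlist=node254,node1 --processes=2 --process_threads=4 --launch=ssh --sharable_heap=512M
OMP_SCHEDULE=static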

45 Cluster OpenMP Environment Variables
Variable (default value) - description:
  KMP_STACKSIZE (1M) - stack size of each principal thread
  KMP_SHARABLE_STACKSIZE - size of the stack used for sharable data on each thread
  KMP_STATSFILE (guide.gvs) - statistics file built with -cluster-openmp-profile
  KMP_CLUSTER_DEBUGGER (none) - debugger executable name
  KMP_WARNINGS (on) - turns warnings on/off
  KMP_SHARABLE_WARNINGS (off) - turns warnings about sharable variables on/off
  KMP_CLUSTER_SETTINGS - outputs the values in kmp_cluster.ini and the environment variables
  KMP_CLUSTER_PATH - where the .kmp_cluster file is located
  KMP_CLUSTER_HELP - outputs the kmp_cluster.ini file
  KMP_VERSION - dumps its value at runtime
  KMP_DISJOINT_HEAPSIZE - size of the disjoint heap, in K or M

46 Cluster OpenMP API Routines
Routine - description:
  void *kmp_sharable_malloc(size_t size) - allocate sharable memory
  void *kmp_aligned_sharable_malloc(size_t size) - allocate sharable memory on a page boundary
  void *kmp_sharable_realloc(void *ptr, size_t size) - re-allocate previously allocated sharable memory
  void kmp_sharable_free(void *ptr) - free sharable memory
  void kmp_set_warnings_on(void) - enable run-time warnings
  void kmp_set_warnings_off(void) - disable run-time warnings
  omp_int_t kmp_get_process_num(void) - return the process number of the current process
  omp_int_t kmp_get_num_processes(void) - return the number of processes
  omp_int_t kmp_get_process_thread_num(void) - return the thread number of the current thread with respect to the current process

47 Allocating Sharable Memory at Run-Time
C
  void *kmp_sharable_malloc(int size);
  void *kmp_aligned_sharable_malloc(int size);
  void kmp_sharable_free(void *ptr);
C++
  #include <kmp_sharable.h>
  foo *fp = new kmp_sharable foo(10);   // allocates a foo object in sharable memory
  class foo : public foo_base, public kmp_sharable_base { };
Fortran
  integer, allocatable :: x(:)
  !dir$ omp sharable (x)
  ...
  allocate(x(500))   ! allocates x in sharable memory

48 Tools for Use with Cluster OpenMP
Correctness tools:
  Intel Thread Checker - finds variables that should be made sharable
  Intel Compiler - finds variables that should be made sharable
  Intel Debugger - debugging
Performance tools:
  Segvprof - shows where the SEGV hot spots are
  Intel Trace Analyzer - displays message traffic between nodes
  Cluster OpenMP dashboard - real-time view of page activity
  Intel Thread Profiler - displays OpenMP performance information

49 Intel Thread Checker
Supported in version 3.1. Used as follows:
  % setenv TC_PREVIEW 1
  % setenv TC_OPTIONS [verbose,]shared
  % setenv KMP_FOR_TCHECK 1
  Set up kmp_cluster.ini with "--processes=1 --process_threads=4"
  Compile the application with -g
  % tcheck_cl [-c] <executable> <executable args>
  (-c clears the instrumentation cache)

50 Intel Debugger
  % setenv IDB_PARALLEL_SHELL <path to your ssh>
  e.g. % setenv IDB_PARALLEL_SHELL /usr/bin/ssh
Source the idbvars.csh script in /opt/intel/idb/<platform>/idbvars.csh
Add the following to kmp_cluster.ini: --no_heartbeat --IO=system --startup_timeout=60000
Compile the application with -g
Set IDB_HOME to the directory where idb resides:
  % setenv IDB_HOME /opt/intel/idb/<platform>/
Use -clomp and the full pathname to the executable:
  % idb -clomp /home/ychuang/test.exe

51 IDB Tips
The program begins running automatically when you enter IDB.
When it prints a prompt, that means it stopped at an automatic breakpoint; use "continue".
"print" is available for printing values of sharable variables.
"break" is available.
Showing the OpenMP team information is not supported.

52 Segvprof.pl
Compile with -g to get line-level profiles.
Set the environment variable KMP_CLUSTER_PROFILE to 1.
Run the code.
Analyze the *.gmon files with segvprof.pl:
  % segvprof.pl -e <executable> *.gmon
The report shows the number of SEGVs that occurred at each line, sorted by region if -cluster-openmp-profile was used at compile time, otherwise for the whole program.

53 Intel Trace Analyzer/Collector
% setenv KMP_TRACE 1
Run the application
% traceanalyzer <application>.stf
Add user events to your code:
  int VT_funcdef(const char *name, int classhandle, int *handle);   e.g. VT_funcdef("INIT", 0, &init_handle);
  int VT_begin(int handle);                                         e.g. VT_begin(init_handle);
  int VT_end(int handle);                                           e.g. VT_end(init_handle);
Show the event timeline and then ungroup the application thread to reveal the user-defined functions on the timeline and see the time listed in the thread window.
A short usage sketch follows.
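A hedged sketch of instrumenting a region with these calls (the function names come from the slide; the header name VT.h and the surrounding program are assumptions made for the example):

/* Sketch: register a user event once, then bracket the region of interest
   with VT_begin/VT_end so it appears on the Trace Analyzer timeline. */
#include <VT.h>          /* Intel Trace Collector API header (assumed name) */

static int init_handle;

void init_phase(void)
{
    VT_begin(init_handle);
    /* ... initialization work to be timed ... */
    VT_end(init_handle);
}

int main(void)
{
    VT_funcdef("INIT", 0, &init_handle);   /* register the user event */
    init_phase();
    return 0;
}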

54 Intel Trace Analyzer/Collector

55 Cluster OpenMP Dashboard
% setenv KMP_CLUSTER_DISPLAY :0.0
Add the following to the kmp_cluster.ini file: KMP_CLUSTER_DISPLAY=:0.0
For example: --processes=2 --process_threads=4 --launch=ssh
Run the application as normal.

56 Intel Thread Profiler
Compile with -cluster-openmp-profile; this creates a file guide.gvs.
Open that file using the Windows* GUI.
Use -p with clomp_forecaster.pl to create a sorted, summarized CSV file from the .gvs file, for use with a spreadsheet.

57 Cluster OpenMP Resources
Example programs and tools can be found at the Intel developer site; the archive clomp_tools.tar.gz is available under
  Intel C++ Compiler, Linux* - Cluster OpenMP
  Intel Fortran Compiler, Linux* - Cluster OpenMP
It contains several Perl scripts:
  clomp_getlatency.pl - measures latency to a remote node
  clomp_configchecker.pl - checks the configuration
  clomp_forecaster.pl - estimates program performance
  segvprof.pl - displays per-line/function profile information
as well as example codes and a kmp_cluster.ini file.

58 References
"OpenMP on MPPs and clusters of SMP nodes using Intel Compilers with Cluster OpenMP" by R. Rabenseifner and B. Krammer, 2006.
"Cluster OpenMP* User's Guide, Version 9.1", Intel Corporation, 2005.
"Intel Cluster OpenMP*" by L. Meadows, Intel, 2007.

