#include <stdio.h>
#include "genC.h"
#include "linear.h"
#include "ri.h"
#include "effects.h"
#include "ri-util.h"
#include "effects-util.h"
#include "misc.h"
#include "effects-generic.h"
#include "effects-simple.h"
#include "control.h"
#include "callgraph.h"
#include "pipsdbm.h"
#include "accel-util.h"
#include "resources.h"
#include "properties.h"
#include "prettyprint.h"

Include dependency graph for gpu-ify.c:

Functions
static const char *	clean_prefix (const char *full_name, const char *bad_prefix)
	Return a pointer on the first char after the bad_prefix. More...

static const char *	get_clean_mod_name (const char *mod_name)
	Trying to get only the original function name without prefix. More...

string	build_outline_name (const char *base_prefix, const char *mod_name)
	Build the outline function name. More...

static bool	mark_loop_to_outline (const statement s)

static void	gpu_ify_statement (statement s, int depth, const char *mod_name)
	Transform a loop nest into a GPU or accelerator-like kernel. More...

bool	gpu_ify (const string mod_name)
	Transform all the parallel loop nests of a module into smaller independent functions suitable for GPU-style accelerators. More...

Variables
static list	loop_nests_to_outline
	A simple phase that outlines parallel loops onto GPU. More...

static const char *	kernel_prefix = 0
	These are the possible prefixes for outline stuff, they are computed from a property and the current module name. More...

static const char *	wrapper_prefix = 0

static const char *	launcher_prefix = 0

static const char *	fwrapper_prefix = 0

Function Documentation

◆ build_outline_name()

string build_outline_name	(	const char *	base_prefix,
		const char *	mod_name
	)

Build the outline function name.

Warning! Do not modify this file that is automatically generated!

Parameters

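base_prefix	prefix the outline function name starts with
mod_name	name of the owning module, appended to the prefix when the GPU_OUTLINE_SUFFIX_WITH_OWNER_NAME property is true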

Definition at line 78 of file gpu-ify.c.

                                                             {
   bool name_suffix_p = get_bool_property("GPU_OUTLINE_SUFFIX_WITH_OWNER_NAME");
  
   char *prefix;
   if(name_suffix_p) {
     // strdup because concatenate is used during build_new_top_level_module_name
     prefix = strdup(concatenate(base_prefix,"_",mod_name,NULL));
   } else {
     prefix = strdup(base_prefix);
   }
  
   string outline_name = build_new_top_level_module_name(prefix,true);
  
   free(prefix);
  
   return outline_name;
 }

References build_new_top_level_module_name(), concatenate(), free(), get_bool_property(), prefix, and strdup().

Referenced by get_next_task_name(), and gpu_ify_statement().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ clean_prefix()

static const char* clean_prefix	(	const char *	full_name,
		const char *	bad_prefix
	)

static

Return a pointer on the first char after the bad_prefix.

Definition at line 45 of file gpu-ify.c.

                                                                                {
   int len = strlen(bad_prefix);
   if(strncasecmp(full_name,bad_prefix,len)==0) {
     full_name = full_name+len;
   }
   // Jump over separator
   if(*full_name=='_') full_name++;
   return full_name;
 }

References full_name.

Referenced by get_clean_mod_name().

Here is the caller graph for this function:

◆ get_clean_mod_name()

static const char* get_clean_mod_name ( const char * mod_name )

static

Trying to get only the original function name without prefix.

Definition at line 59 of file gpu-ify.c.

                                                             {
  
   kernel_prefix   = get_string_property("GPU_KERNEL_PREFIX");
   launcher_prefix = get_string_property("GPU_LAUNCHER_PREFIX");
   wrapper_prefix  = get_string_property("GPU_WRAPPER_PREFIX");
   fwrapper_prefix = get_string_property("GPU_FORTRAN_WRAPPER_PREFIX");
  
   const char * clean_mod_name = mod_name;
  
   clean_mod_name = clean_prefix(clean_mod_name,launcher_prefix);
   clean_mod_name = clean_prefix(clean_mod_name,fwrapper_prefix);
   clean_mod_name = clean_prefix(clean_mod_name,wrapper_prefix);
   clean_mod_name = clean_prefix(clean_mod_name,kernel_prefix);
   return clean_mod_name;
 }

References clean_prefix(), fwrapper_prefix, get_string_property(), kernel_prefix, launcher_prefix, and wrapper_prefix.

Referenced by gpu_ify().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ gpu_ify()

bool gpu_ify ( const string mod_name )

Transform all the parallel loop nests of a module into smaller independent functions suitable for GPU-style accelerators.

What can be done is described in more detail in gpu_ify_statement(). The various functions are generated or not according to different properties.

Parameters

mod_name is the name of the module to work on.

Returns: true since it should succeed...

Outline the previous marked loop nests. First put the statements to outline in the good order:

Clean module name from prefix

Parameters

mod_name name of the module whose parallel loop nests are outlined

Definition at line 347 of file gpu-ify.c.

                                     {
   // Use this module name and this environment variable to set
   statement module_statement = PIPS_PHASE_PRELUDE(mod_name,
                                                   "GPU_IFY_DEBUG_LEVEL");
  
   // Get the effects and use them:
   set_cumulated_rw_effects((statement_effects)db_get_memory_resource(DBR_CUMULATED_EFFECTS,mod_name,true));
  
   // Initialize the loop nest set to outline to the empty set yet:
   loop_nests_to_outline = NIL;
  
   // Mark interesting loops:
   gen_recurse(module_statement,
               statement_domain, mark_loop_to_outline, gen_null);
  
   /* Outline the previous marked loop nests.
      First put the statements to outline in the good order: */
   loop_nests_to_outline = gen_nreverse(loop_nests_to_outline);
  
   /* Clean module name from prefix */
   const char* clean_mod_name=get_clean_mod_name(global_name_to_user_name(entity_name(get_current_module_entity())));
  
   FOREACH(STATEMENT, s, loop_nests_to_outline) {
     // We could have stored the depth, but it complexifies the code...
     gpu_ify_statement(s, depth_of_parallel_perfect_loop_nest(s),clean_mod_name);
   }
  
   gen_free_list(loop_nests_to_outline);
  
   // No longer use effects:
   reset_cumulated_rw_effects();
  
   // We may have outline some code, so recompute the callees:
   DB_PUT_MEMORY_RESOURCE(DBR_CALLEES, mod_name,
                          compute_callees(get_current_module_statement()));
  
   // Put back the new statement module
   PIPS_PHASE_POSTLUDE(module_statement);
   // The macro above does a "return TRUE" indeed.
 }

References compute_callees(), db_get_memory_resource(), DB_PUT_MEMORY_RESOURCE, depth_of_parallel_perfect_loop_nest(), entity_name, FOREACH, gen_free_list(), gen_nreverse(), gen_null(), gen_recurse, get_clean_mod_name(), get_current_module_entity(), get_current_module_statement(), global_name_to_user_name(), gpu_ify_statement(), loop_nests_to_outline, mark_loop_to_outline(), module_statement, NIL, PIPS_PHASE_POSTLUDE, PIPS_PHASE_PRELUDE, reset_cumulated_rw_effects(), set_cumulated_rw_effects(), STATEMENT, and statement_domain.

Here is the call graph for this function:

◆ gpu_ify_statement()

static void gpu_ify_statement	(	statement	s,
		int	depth,
		const char *	mod_name
	)

static

Transform a loop nest into a GPU or accelerator-like kernel.

Parameters

s	is the parallel loop-nest statement
depth	is the number of loop in the loop nest to be taken out as the GPU iterators

Several properties can be used to change the behaviour of this function, as explained in pipsmake-rc

For example, if depth = 2 and s is: for(i = 1; i <= 499; i += 1) for(j = 1; j <= 499; j += 1) save[i][j] = 0.25*(space[i-1][j]+space[i+1][j]+space[i][j-1]+space[i][j+1]);

it generates something like: [...] If the GPU_USE_LAUNCHER property is true, this kind of function is generated: void p4a_kernel_launcher_0(float_t save[501][501], float_t space[501][501]) { int i; int j; for(i = 1; i <= 499; i += 1) for(j = 1; j <= 499; j += 1)

p4a_kernel_wrapper_0(save, space, i, j); }

If the GPU_USE_WRAPPER property is true, this kind of function is generated: void p4a_kernel_wrapper_0(float_t save[501][501], float_t space[501][501], int i, int j) { To be assigned to a call to P4A_vp_0: i To be assigned to a call to P4A_vp_1: j p4a_kernel_0(save, space, i, j); }

If the GPU_USE_KERNEL property is true, this kind of function is generated: void p4a_kernel_0(float_t save[501][501], float_t space[501][501], int i, int j) { save[i][j] = 0.25*(space[i-1][j]+space[i+1][j]+space[i][j-1]+space[i][j+1]); }

Other properties modify the behaviour: GPU_USE_KERNEL_INDEPENDENT_COMPILATION_UNIT, GPU_USE_LAUNCHER_INDEPENDENT_COMPILATION_UNIT, GPU_USE_WRAPPER_INDEPENDENT_COMPILATION_UNIT, GPU_COORDINATE_INTRINSICS_FORMAT, GPU_USE_FORTRAN_WRAPPER

Look at pipsmake-rc documentation.

If we want to outline a kernel:

First outline the innermost code (the kernel itself) to avoid spoiling its memory effects if we start with the outermost code first. The kernel name with a prefix defined in the GPU_KERNEL_PREFIX property:

Do we need to insert a wrapper phase to reconstruct iteration coordinates from hardware intrinsics?

Add index initialization from GPU coordinates, in the reverse order since we use insert_comments_to_statement() to avoid furthering the first statement from its original comment:

Add a comment to know what to do later:

Map the inner loop index (numbered i) with the lower GPU coordinate (numbered depth - 1 - i)). In this way, if the code was cache-friendly, it should remain GPU-memory friendly

Build the intrinsics of this form: P4A_vp_<depth - 1 - i>

Add a comment in the form of

To be replaced with a call to P4A_vp_1: j

that may be replaced by a post-processor later by

j = P4A_vp_1(); or whatever according to the target accelerator

Then outline the innermost code again (the kernel wrapper) that owns the kernel call. The kernel wrapper name with a prefix defined in the GPU_WRAPPER_PREFIX property:

Here we check if we had requested to outline a kernel previously, and we ensure that if the wrapper wasn't generated in a new compilation unit, then it should be added in the same compilation unit as the kernel. It won't be declared in the compilation unit, but if the kernel has been generated in a new compilation unit, there is no PARSED_CODE resource available and thus we can't use AddEntityToCompilationUnit()

Outline the kernel launcher with a prefix defined in the GPU_LAUNCHER_PREFIX property:

Definition at line 199 of file gpu-ify.c.

                                                                 {
   ifdebug(1) {
     pips_debug(1, "Parallel loop-nest of depth %d\n", depth);
     print_statement(s);
   }
   // Get the statement inside the loop-nest:
   statement inner = perfectly_nested_loop_to_body_at_depth(s, depth);
  
   // Save the value of a property we are going to change locally:
   bool old_outline_independent_compilation_unit =
     get_bool_property("OUTLINE_INDEPENDENT_COMPILATION_UNIT");
  
   /* If we want to outline a kernel: */
   string kernel_name = string_undefined;
   if (get_bool_property("GPU_USE_KERNEL")) {
     /* First outline the innermost code (the kernel itself) to avoid
        spoiling its memory effects if we start with the outermost code
        first. The kernel name with a prefix defined in the
        GPU_KERNEL_PREFIX property: */
     list sk = CONS(STATEMENT, inner, NIL);
     // Choose if we want the kernel in its own file:
     set_bool_property("OUTLINE_INDEPENDENT_COMPILATION_UNIT",
                       get_bool_property("GPU_USE_KERNEL_INDEPENDENT_COMPILATION_UNIT"));
     kernel_name = build_outline_name(kernel_prefix, mod_name);
     outliner(build_outline_name(kernel_prefix, mod_name),sk);
     //insert_comments_to_statement(inner, "// Call the compute kernel:");
   }
  
   /* Do we need to insert a wrapper phase to reconstruct iteration
      coordinates from hardware intrinsics? */
   if (get_bool_property("GPU_USE_WRAPPER")) {
     /* Add index initialization from GPU coordinates, in the reverse order
        since we use insert_comments_to_statement() to avoid furthering the
        first statement from its original comment: */
     for(int i = depth - 1; i >= 0; i--) {
       entity index = perfectly_nested_loop_index_at_depth(s, i);
       // Get the iteration coordinate intrinsic, for example P4A_vp_1:
       /*
         This code makes a
 resource SUMMARY_EFFECTS[p4a_kernel_launcher_1] is in 'required' status since 149
 resource CUMULATED_EFFECTS[p4a_kernel_launcher_1] is in 'required' status since 152
 resource PROPER_EFFECTS[p4a_kernel_launcher_1] is in 'required' status since 152
 resource SUMMARY_EFFECTS[p4a_kernel_wrapper_1] is in 'required' status since 152
 resource CUMULATED_EFFECTS[p4a_kernel_wrapper_1] is in 'required' status since 155
 resource PROPER_EFFECTS[p4a_kernel_wrapper_1] is in 'required' status since 155
 user error in rmake: recursion on resource SUMMARY_EFFECTS of p4a_kernel_wrapper_1
       statement assign = make_assign_statement(entity_to_expression(index),
                                                MakeUnaryCall(get_coordinate_intrinsic(i),
      entity_to_expression(index)));
      So keep simple right now
       */
  
       /* Add a comment to know what to do later: */
       string comment;
       string intrinsic_name;
       /* Map the inner loop index (numbered i) with the lower GPU
          coordinate (numbered depth - 1 - i)). In this way, if the code
          was cache-friendly, it should remain GPU-memory friendly
  
          Build the intrinsics of this form: P4A_vp_<depth - 1 - i>
       */
       asprintf(&intrinsic_name,
                get_string_property("GPU_COORDINATE_INTRINSICS_FORMAT"),
                depth - 1 - i);
       /* Add a comment in the form of
  
          To be replaced with a call to P4A_vp_1: j
  
          that may replaced by a post-processor later by
  
          j = P4A_vp_1();
          or whatever according to the target accelerator
       */
       asprintf(&comment, "%s To be assigned to a call to %s: %s\n",
                c_module_p(get_current_module_entity()) ? "//" : "C",
                intrinsic_name,
                entity_user_name(index));
       free(intrinsic_name);
       insert_comments_to_statement(inner, comment);
     }
  
     /* Then outline the innermost code again (the kernel wrapper) that owns
        the kernel call. The kernel wrapper name with a prefix defined in the
        GPU_WRAPPER_PREFIX property: */
     list sk = CONS(STATEMENT, inner, NIL);
     // Choose if we want the wrapper in its own file:
     set_bool_property("OUTLINE_INDEPENDENT_COMPILATION_UNIT",
                       get_bool_property("GPU_USE_WRAPPER_INDEPENDENT_COMPILATION_UNIT"));
     string wrapper_name = build_outline_name(wrapper_prefix, mod_name);
     outliner(wrapper_name, sk);
  
     /* Here we check if we had requested to outline a kernel previously, and we
      * ensure that if the wrapper wasn't generated in a new compilation unit,
      * then it should be added in the same compilation unit as the kernel.
      * It won't be declared in the compilation unit, but if the kernel have been
      * generated in a new compilation unit, there is no PARSED_CODE resource
      * available and thus we can't use AddEntityToCompilationUnit()
      */
     if(kernel_name && !string_undefined_p(kernel_name)
         && !get_bool_property("GPU_USE_WRAPPER_INDEPENDENT_COMPILATION_UNIT")) {
       string source_file_name =
         db_get_memory_resource(DBR_USER_FILE, kernel_name, true);
       DB_PUT_FILE_RESOURCE(DBR_USER_FILE, wrapper_name, strdup(source_file_name));
     }
  
     //insert_comments_to_statement(inner, "// Call the compute kernel wrapper:");
   }
  
   if (get_bool_property("GPU_USE_LAUNCHER")) {
     /* Outline the kernel launcher with a prefix defined in the
        GPU_LAUNCHER_PREFIX property: */
     if(get_bool_property("GPU_IFY_ANNOTATE_LOOP_NESTS")) {
       // Annotate loop nest now, so that we know which are parallel !
       bool gpu_loop_nest_annotate_on_statement(statement s);
       gpu_loop_nest_annotate_on_statement(s);
     }
     list sl = CONS(STATEMENT, s, NIL);
     statement st;
     // Choose if we want the launcher in its own file:
     set_bool_property("OUTLINE_INDEPENDENT_COMPILATION_UNIT",
                       get_bool_property("GPU_USE_LAUNCHER_INDEPENDENT_COMPILATION_UNIT"));
     st = outliner(build_outline_name(launcher_prefix, mod_name), sl);
     if (get_bool_property("GPU_USE_FORTRAN_WRAPPER")) {
       string fwp = strdup(concatenate(fwrapper_prefix,"_",mod_name,NULL));
       ifdebug(3) {
         pips_debug(1, "Outline Fortan_wrapper with prefix %s\n", fwp);
       }
       outliner (build_new_top_level_module_name(fwp, true),CONS(STATEMENT,st,NIL));
           free(fwp);
     }
     //insert_comments_to_statement(inner, "// Call the compute kernel launcher:");
   }
   // Restore the original property value:
   set_bool_property("OUTLINE_INDEPENDENT_COMPILATION_UNIT",
                     old_outline_independent_compilation_unit);
 }

References asprintf, build_new_top_level_module_name(), build_outline_name(), c_module_p(), comment(), concatenate(), CONS, db_get_memory_resource(), DB_PUT_FILE_RESOURCE, depth, entity_user_name(), free(), fwrapper_prefix, get_bool_property(), get_current_module_entity(), get_string_property(), gpu_loop_nest_annotate_on_statement(), ifdebug, insert_comments_to_statement(), kernel_prefix, launcher_prefix, NIL, outliner(), perfectly_nested_loop_index_at_depth(), perfectly_nested_loop_to_body_at_depth(), pips_debug, print_statement(), set_bool_property(), STATEMENT, strdup(), string_undefined, string_undefined_p, and wrapper_prefix.

Referenced by gpu_ify().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mark_loop_to_outline()

static bool mark_loop_to_outline ( const statement s )

static

An interesting loop must be parallel first...

We recurse on statements instead of loops in order to pick up information on the statement itself, such as pragmas

Since we only outline outermost loop-nest, stop digging further in this statement:

Definition at line 120 of file gpu-ify.c.

                                         {
   /* An interesting loop must be parallel first...
  
      We recurse on statements instead of loops in order to pick
      informations on the statement itself, such as pragmas
   */
   if(statement_loop_p(s)) {
     int parallel_loop_nest_depth = depth_of_parallel_perfect_loop_nest(s);
     ifdebug(3) {
       pips_debug(1, "Statement %td with // depth %d\n", statement_number(s),
           parallel_loop_nest_depth);
       print_statement(s);
     }
     if (parallel_loop_nest_depth > 0) {
       // Register the loop-nest (note the list is in the reverse order):
       loop_nests_to_outline = CONS(STATEMENT, s, loop_nests_to_outline);
       /* Since we only outline outermost loop-nest, stop digging further in
          this statement: */
       pips_debug(1, "Statement %td marked to be outlined\n", statement_number(s));
       return false;
     }
   }
   // This statement is not a parallel loop, go on digging:
   return true;
 }

References CONS, depth_of_parallel_perfect_loop_nest(), ifdebug, loop_nests_to_outline, pips_debug, print_statement(), STATEMENT, statement_loop_p(), and statement_number.

Referenced by gpu_ify().

Here is the call graph for this function:

Here is the caller graph for this function:

Variable Documentation

◆ fwrapper_prefix

const char* fwrapper_prefix = 0

static

Definition at line 41 of file gpu-ify.c.

Referenced by get_clean_mod_name(), and gpu_ify_statement().

◆ kernel_prefix

const char* kernel_prefix = 0

static

These are the possible prefixes for outline stuff, they are computed from a property and the current module name.

Definition at line 38 of file gpu-ify.c.

Referenced by get_clean_mod_name(), and gpu_ify_statement().

◆ launcher_prefix

const char* launcher_prefix = 0

static

Definition at line 40 of file gpu-ify.c.

Referenced by get_clean_mod_name(), and gpu_ify_statement().

◆ loop_nests_to_outline

list loop_nests_to_outline

static

A simple phase that outlines parallel loops onto GPU.

Ronan.Keryell@hpc-project.com Store the loop nests found that meet the spec to be executed on a GPU. Use a list and not a set or hash_map to have always the same order

Definition at line 32 of file gpu-ify.c.

Referenced by gpu_ify(), and mark_loop_to_outline().

◆ wrapper_prefix

const char* wrapper_prefix = 0

static

Definition at line 39 of file gpu-ify.c.

Referenced by get_clean_mod_name(), and gpu_ify_statement().

Functions

Variables

Function Documentation

◆ build_outline_name()

◆ clean_prefix()

◆ get_clean_mod_name()

◆ gpu_ify()

◆ gpu_ify_statement()

◆ mark_loop_to_outline()

Variable Documentation

◆ fwrapper_prefix

◆ kernel_prefix

◆ launcher_prefix

◆ loop_nests_to_outline

◆ wrapper_prefix