IOR
utilities.c
Go to the documentation of this file.
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  */
4 /******************************************************************************\
5 * *
6 * Copyright (c) 2003, The Regents of the University of California *
7 * See the file COPYRIGHT for a complete copyright notice and license. *
8 * *
9 ********************************************************************************
10 *
11 * Additional utilities
12 *
13 \******************************************************************************/
14 
15 #ifdef HAVE_CONFIG_H
16 # include "config.h"
17 #endif
18 
19 #ifdef HAVE_GETCPU_SYSCALL
20 # define _GNU_SOURCE
21 # include <unistd.h>
22 # include <sys/syscall.h>
23 #endif
24 
25 #ifdef __linux__
26 # define _GNU_SOURCE /* Needed for O_DIRECT in fcntl */
27 #endif /* __linux__ */
28 
29 #include <stdarg.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <errno.h>
33 #include <fcntl.h>
34 #include <math.h> /* pow() */
35 #include <string.h>
36 #include <sys/stat.h>
37 #include <sys/types.h>
38 #include <time.h>
39 
40 #ifdef HAVE_CUDA
41 #include <cuda_runtime.h>
42 #endif
43 
44 #ifndef _WIN32
45 # include <regex.h>
46 # ifdef __sun /* SunOS does not support statfs(), instead uses statvfs() */
47 # include <sys/statvfs.h>
48 # elif (defined __APPLE__)
49 # include <sys/param.h>
50 # include <sys/mount.h>
51 # else /* ! __sun or __APPLE__ */
52 # include <sys/statfs.h>
53 # endif /* __sun */
54 # include <sys/time.h> /* gettimeofday() */
55 #endif
56 
57 #include "utilities.h"
58 #include "aiori.h"
59 #include "ior.h"
60 #include "ior-internal.h"
61 
62 #define RANDALGO_GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
63 
64 /************************** D E C L A R A T I O N S ***************************/
65 
66 extern int errno;
67 extern int numTasks;
68 
69 /* globals used by other files, also defined "extern" in utilities.h */
70 int rank = 0;
71 int rankOffset = 0;
72 int verbose = VERBOSE_0; /* verbose output */
73 MPI_Comm testComm = MPI_COMM_NULL;
74 FILE * out_logfile = NULL;
77 
78 /* local */
79 //int rand_state_init = 0;
80 //uint64_t rand_state = 0;
81 
82 /***************************** F U N C T I O N S ******************************/
83 
94 void update_write_memory_pattern(uint64_t item, char * buf, size_t bytes, int rand_seed, int pretendRank, ior_dataPacketType_e dataPacketType, ior_memory_flags type){
95  if (dataPacketType == DATA_TIMESTAMP || bytes < 8)
96  return;
97 
98 #ifdef HAVE_GPU_DIRECT
100  update_write_memory_pattern_gpu(item, buf, bytes, rand_seed, pretendRank, dataPacketType);
101  return;
102  }
103 #endif
104 
105  size_t size = bytes / sizeof(uint64_t);
106  uint64_t * buffi = (uint64_t*) buf;
107 
108  if (dataPacketType == DATA_RANDOM) {
109  uint64_t rand_state_local;
110  unsigned seed = rand_seed + pretendRank + item;
111  rand_state_local = rand_r(&seed);
112  for (size_t i = 0; i < size; i++) {
113  rand_state_local *= RANDALGO_GOLDEN_RATIO_PRIME;
114  rand_state_local >>= 3;
115  buffi[i] = rand_state_local;
116  }
117  return;
118  }
119 
120  /* DATA_INCOMPRESSIBLE and DATA_OFFSET */
121  int k = 1;
122  for(size_t i=0; i < size; i+=512, k++){
123  buffi[i] = ((uint32_t) item * k) | ((uint64_t) pretendRank) << 32;
124  }
125 }
126 
137 void generate_memory_pattern(char * buf, size_t bytes, int rand_seed, int pretendRank, ior_dataPacketType_e dataPacketType, ior_memory_flags type){
138 #ifdef HAVE_GPU_DIRECT
140  generate_memory_pattern_gpu(buf, bytes, rand_seed, pretendRank, dataPacketType);
141  return;
142  }
143 #endif
144  uint64_t * buffi = (uint64_t*) buf;
145  // first half of 64 bits use the rank
146  const size_t size = bytes / 8;
147  // the first 8 bytes of each 4k block are updated at runtime
148  for(size_t i=0; i < size; i++){
149  switch(dataPacketType){
150  case(DATA_RANDOM):
151  // Nothing to do, will work on updates
152  break;
153  case(DATA_INCOMPRESSIBLE):{
154  unsigned seed = rand_seed + pretendRank;
155  uint64_t hi = ((uint64_t) rand_r(& seed) << 32);
156  uint64_t lo = (uint64_t) rand_r(& seed);
157  buffi[i] = hi | lo;
158  break;
159  }case(DATA_OFFSET):{
160  }case(DATA_TIMESTAMP):{
161  buffi[i] = ((uint64_t) pretendRank) << 32 | rand_seed + i;
162  break;
163  }
164  }
165  }
166 
167  for(size_t i=size*8; i < bytes; i++){
168  buf[i] = (char) i;
169  }
170 }
171 
172 void invalidate_buffer_pattern(char * buffer, size_t bytes, ior_memory_flags type){
174 #ifdef HAVE_GPU_DIRECT
175  cudaMemset(buffer, 0x42, bytes > 512 ? 512 : bytes);
176 #endif
177  }else{
178  buffer[0] = ~buffer[0]; // changes the buffer, no memset to reduce the memory pressure
179  }
180 }
181 
182 int verify_memory_pattern(uint64_t item, char * buffer, size_t bytes, int rand_seed, int pretendRank, ior_dataPacketType_e dataPacketType, ior_memory_flags type){
183  int error = 0;
184 #ifdef HAVE_GPU_DIRECT
186  error = verify_memory_pattern_gpu(item, buffer, bytes, rand_seed, pretendRank, dataPacketType);
187  return error;
188  }
189 #endif
190  // always read all data to ensure that performance numbers stay the same
191  uint64_t * buffi = (uint64_t*) buffer;
192 
193  // the first 8 bytes are set to item number
194  int k=1;
195 
196  uint64_t rand_state_local;
197  unsigned seed = rand_seed + pretendRank + item;
198  rand_state_local = rand_r(&seed);
199  const size_t size = bytes / 8;
200  for(size_t i=0; i < size; i++){
201  uint64_t exp;
202 
203  switch(dataPacketType){
204  case(DATA_RANDOM):
205  rand_state_local *= RANDALGO_GOLDEN_RATIO_PRIME;
206  rand_state_local >>= 3;
207  exp = rand_state_local;
208  break;
209  case(DATA_INCOMPRESSIBLE):{
210  unsigned seed = rand_seed + pretendRank;
211  uint64_t hi = ((uint64_t) rand_r(& seed) << 32);
212  uint64_t lo = (uint64_t) rand_r(& seed);
213  exp = hi | lo;
214  break;
215  }case(DATA_OFFSET):{
216  }case(DATA_TIMESTAMP):{
217  exp = ((uint64_t) pretendRank) << 32 | rand_seed + i;
218  break;
219  }
220  }
221  if(i % 512 == 0 && (dataPacketType != DATA_TIMESTAMP) && dataPacketType != DATA_RANDOM){
222  exp = ((uint32_t) item * k) | ((uint64_t) pretendRank) << 32;
223  k++;
224  }
225  if(buffi[i] != exp){
226  error = 1;
227  }
228  }
229  for(size_t i=size*8; i < bytes; i++){
230  if(buffer[i] != (char) i){
231  error = 1;
232  }
233  }
234 
235  return error;
236 }
237 
/**
 * Allocate a zero-filled block of memory.
 *
 * ERR() is invoked with "Could not malloc an array" when malloc() fails.
 *
 * @param size  number of bytes to allocate
 * @return pointer to the zeroed allocation
 */
void* safeMalloc(uint64_t size){
        void *mem = malloc(size);

        if (!mem) {
                ERR("Could not malloc an array");
        }
        /* memset() returns its destination pointer */
        return memset(mem, 0, size);
}
246 
247 void FailMessage(int rank, const char *location, char *format, ...) {
248  char msg[4096];
249  va_list args;
250  va_start(args, format);
251  vsnprintf(msg, 4096, format, args);
252  va_end(args);
253  fprintf(out_logfile, "%s: Process %d: FAILED in %s, %s\n",
254  PrintTimestamp(), rank, location, msg);
255  fflush(out_logfile);
256  MPI_Abort(testComm, 1);
257 }
258 
/**
 * Convert a memory specification into a byte count.
 *
 * A string of the form "<integer>%" is interpreted as a percentage of the
 * node's physical memory (page size times number of physical pages).  Any
 * other string is passed to string_to_bytes().
 *
 * @param size_str  specification such as "75%" or a size accepted by
 *                  string_to_bytes()
 * @return number of bytes
 */
size_t NodeMemoryStringToBytes(char *size_str)
{
        int percent = 0;
        int after_percent_sign = 0;
        long page_size;
        long num_pages = 0;
        long long mem;

        /*
         * sscanf() returns 1 as soon as the integer is read, whether or not a
         * '%' follows, so the return value alone cannot tell "75%" from "75"
         * or "2g".  %n is only stored once the literal '%' has matched; a
         * value still at 0 means there was no percent sign.
         */
        if (sscanf(size_str, " %d %% %n", &percent, &after_percent_sign) != 1
            || after_percent_sign == 0)
                return (size_t) string_to_bytes(size_str);
        if (percent > 100 || percent < 0)
                ERR("percentage must be between 0 and 100");

#ifdef HAVE_SYSCONF
        page_size = sysconf(_SC_PAGESIZE);
#else
        page_size = getpagesize();
#endif

#ifdef _SC_PHYS_PAGES
        num_pages = sysconf(_SC_PHYS_PAGES);
        if (num_pages == -1)
                ERR("sysconf(_SC_PHYS_PAGES) is not supported");
#else
        ERR("sysconf(_SC_PHYS_PAGES) is not supported");
#endif
        /* widen before multiplying so the product is not computed in long */
        mem = (long long) page_size * num_pages;

        return mem / 100 * percent;
}
290 
292  switch(t) {
293  case '\0': return DATA_TIMESTAMP;
294  case 'i': /* Incompressible */
295  return DATA_INCOMPRESSIBLE;
296  case 't': /* timestamp */
297  return DATA_TIMESTAMP;
298  case 'o': /* offset packet */
299  return DATA_OFFSET;
300  case 'r': /* randomized blocks */
301  return DATA_RANDOM;
302  default:
303  ERRF("Unknown packet type \"%c\"; generic assumed\n", t);
304  return DATA_OFFSET;
305  }
306 }
307 
309  if (options->setTimeStampSignature){
310  options->incompressibleSeed = options->setTimeStampSignature;
311  }
312 
313  if (options->buffer_type && options->buffer_type[0] != 0){
314  options->dataPacketType = parsePacketType(options->buffer_type[0]);
315  }
316  if (options->memoryPerNodeStr){
318  }
319  const ior_aiori_t * backend = aiori_select(options->api);
320  if (backend == NULL)
321  ERR("Unrecognized I/O API");
322 
323  options->backend = backend;
324  /* copy the actual module options into the test */
325  options->backend_options = airoi_update_module_options(backend, global_options);
326  options->apiVersion = backend->get_version();
327 }
328 
/*
 * Add the direct-I/O open flag to *flag.
 * Used in aiori-POSIX.c and aiori-PLFS.c
 *
 * note that TRU64 needs O_DIRECTIO, SunOS uses directio(),
 * and everyone else needs O_DIRECT.  When neither macro exists a warning is
 * issued and O_DIRECT is defined as 0, leaving *flag unchanged.
 */
void set_o_direct_flag(int *flag)
{
#if !defined(O_DIRECT)
# if defined(O_DIRECTIO)
#  define O_DIRECT O_DIRECTIO
# else /* neither O_DIRECT nor O_DIRECTIO */
        WARN("cannot use O_DIRECT");
#  define O_DIRECT 000000
# endif /* O_DIRECTIO */
#endif /* not O_DIRECT */

        *flag = *flag | O_DIRECT;
}
347 
348 
/*
 * Returns string containing the current time.
 *
 * NOTE: On some systems, MPI jobs hang while ctime() waits for a lock.
 * This is true even though CurrentTimeString() is only called for rank==0.
 * ctime_r() fixes this, so it is used whenever the feature-test macros
 * below indicate it is available.
 *
 * The returned string ends in '\n' (ctime format).
 */
char *CurrentTimeString(void)
{
        static time_t currentTime;
        char *stamp;

        currentTime = time(NULL);
        if (currentTime == -1)
                ERR("cannot get current time");

#if (_POSIX_C_SOURCE >= 1 || _XOPEN_SOURCE || _BSD_SOURCE || _SVID_SOURCE || _POSIX_SOURCE)
        static char threadSafeBuff[32]; /* "must be at least 26 characters long" */
        stamp = ctime_r(&currentTime, threadSafeBuff);
#else
        stamp = ctime(&currentTime);
#endif
        if (stamp == NULL) {
                ERR("cannot read current time");
        }
        /* ctime string ends in \n */
        return stamp;
}
377 
378 /*
379  * Dump transfer buffer.
380  */
381 void DumpBuffer(void *buffer,
382  size_t size) /* <size> in bytes */
383 {
384  size_t i, j;
385  IOR_size_t *dumpBuf = (IOR_size_t *)buffer;
386 
387  /* Turns out, IOR_size_t is unsigned long long, but we don't want
388  to assume that it must always be */
389  for (i = 0; i < ((size / sizeof(IOR_size_t)) / 4); i++) {
390  for (j = 0; j < 4; j++) {
391  fprintf(out_logfile, IOR_format" ", dumpBuf[4 * i + j]);
392  }
393  fprintf(out_logfile, "\n");
394  }
395  return;
396 } /* DumpBuffer() */
397 
398 /* a function that prints an int array where each index corresponds to a rank
399  and the value is whether that rank is on the same host as root.
400  Also returns 1 if rank 1 is on same host and 0 otherwise
401 */
402 int QueryNodeMapping(MPI_Comm comm, int print_nodemap) {
403  char localhost[MAX_PATHLEN], roothost[MAX_PATHLEN];
404  int num_ranks;
405  MPI_Comm_size(comm, &num_ranks);
406  int *node_map = (int*)malloc(sizeof(int) * num_ranks);
407  if ( ! node_map ) {
408  FAIL("malloc");
409  }
410  if (gethostname(localhost, MAX_PATHLEN) != 0) {
411  FAIL("gethostname()");
412  }
413  if (rank==0) {
414  strncpy(roothost,localhost,MAX_PATHLEN);
415  }
416 
417  /* have rank 0 broadcast out its hostname */
418  MPI_Bcast(roothost, MAX_PATHLEN, MPI_CHAR, 0, comm);
419  //printf("Rank %d received root host as %s\n", rank, roothost);
420  /* then every rank figures out whether it is same host as root and then gathers that */
421  int same_as_root = strcmp(roothost,localhost) == 0;
422  MPI_Gather( &same_as_root, 1, MPI_INT, node_map, 1, MPI_INT, 0, comm);
423  if ( print_nodemap && rank==0) {
424  fprintf( out_logfile, "Nodemap: " );
425  for ( int i = 0; i < num_ranks; i++ ) {
426  fprintf( out_logfile, "%d", node_map[i] );
427  }
428  fprintf( out_logfile, "\n" );
429  }
430  int ret = 1;
431  if(num_ranks>1)
432  ret = node_map[1] == 1;
433  MPI_Bcast(&ret, 1, MPI_INT, 0, comm);
434  free(node_map);
435  return ret;
436 }
437 
/**
 * Select the CUDA device for this task (no-op without HAVE_CUDA).
 *
 * @param blockMapping  non-zero: device = (rank % tasksPerNode) % devices,
 *                      zero:     device = (rank / numNodes) % devices
 * @param rank          rank of the calling task
 * @param numNodes      number of nodes
 * @param tasksPerNode  number of tasks per node
 * @param useGPUID      explicit device id, or -1 for the round-robin
 *                      selection described above
 */
void initCUDA(int blockMapping, int rank, int numNodes, int tasksPerNode, int useGPUID){
#ifdef HAVE_CUDA
        int device_count;
        cudaError_t cret = cudaGetDeviceCount(& device_count);
        if(cret != cudaSuccess){
                ERRF("cudaGetDeviceCount() error: %d %s", (int) cret, cudaGetErrorString(cret));
        }
        //if (rank == 0){
        //  char val[20];
        //  sprintf(val, "%d", device_count);
        //  PrintKeyVal("cudaDevices", val);
        //}
        int device = useGPUID;
        // if set to -1 use round robin per task
        if(useGPUID == -1){
                if(device_count <= 0){
                        /* avoid a modulo by zero below */
                        WARN("no CUDA device available");
                        return;
                }
                if(blockMapping){
                        device = (rank % tasksPerNode) % device_count;
                }else{
                        device = (rank / numNodes) % device_count;
                }
        }
        cret = cudaSetDevice(device);
        if(cret != cudaSuccess){
                /* report the device that was actually requested */
                WARNF("cudaSetDevice(%d) error: %s", device, cudaGetErrorString(cret));
        }
#endif
}
467 
468 /*
469  * There is a more direct way to determine the node count in modern MPI
470  * versions so we use that if possible.
471  *
472  * For older versions we use a method which should still provide accurate
473  * results even if the total number of tasks is not evenly divisible by the
474  * tasks on node rank 0.
475  */
476 int GetNumNodes(MPI_Comm comm) {
477  if (getenv("IOR_FAKE_NODES")){
478  int numNodes = atoi(getenv("IOR_FAKE_NODES"));
479  int rank;
480  MPI_Comm_rank(comm, & rank);
481  if(rank == 0){
482  printf("Fake number of node: using %d\n", numNodes);
483  }
484  return numNodes;
485  }
486 #if MPI_VERSION >= 3
487  MPI_Comm shared_comm;
488  int shared_rank = 0;
489  int local_result = 0;
490  int numNodes = 0;
491 
492  MPI_CHECK(MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shared_comm),
493  "MPI_Comm_split_type() error");
494  MPI_CHECK(MPI_Comm_rank(shared_comm, &shared_rank), "MPI_Comm_rank() error");
495  local_result = shared_rank == 0? 1 : 0;
496  MPI_CHECK(MPI_Allreduce(&local_result, &numNodes, 1, MPI_INT, MPI_SUM, comm),
497  "MPI_Allreduce() error");
498  MPI_CHECK(MPI_Comm_free(&shared_comm), "MPI_Comm_free() error");
499 
500  return numNodes;
501 #else
502  int numTasks = 0;
503  int numTasksOnNode0 = 0;
504 
505  numTasks = GetNumTasks(comm);
506  numTasksOnNode0 = GetNumTasksOnNode0(comm);
507 
508  return ((numTasks - 1) / numTasksOnNode0) + 1;
509 #endif
510 }
511 
512 
513 int GetNumTasks(MPI_Comm comm) {
514  int numTasks = 0;
515 
516  MPI_CHECK(MPI_Comm_size(comm, &numTasks), "cannot get number of tasks");
517 
518  return numTasks;
519 }
520 
521 
522 /*
523  * It's very important that this method provide the same result to every
524  * process as it's used for redistributing which jobs read from which files.
525  * It was renamed accordingly.
526  *
527  * If different nodes get different results from this method then jobs get
528  * redistributed unevenly and you no longer have a 1:1 relationship with some
529  * nodes reading multiple files while others read none.
530  *
531  * In the common case the number of tasks on each node (MPI_Comm_size on an
532  * MPI_COMM_TYPE_SHARED communicator) will be the same. However, there is
533  * nothing which guarantees this. It's valid to have, for example, 64 jobs
534  * across 4 systems which can run 20 jobs each. In that scenario you end up
535  * with 3 MPI_COMM_TYPE_SHARED groups of 20, and one group of 4.
536  *
537  * In the (MPI_VERSION < 3) implementation of this method consistency is
538  * ensured by asking specifically about the number of tasks on the node with
539  * rank 0. In the original implementation for (MPI_VERSION >= 3) this was
540  * broken by using the LOCAL process count which differed depending on which
541  * node you were on.
542  *
543  * This was corrected below by first splitting the comm into groups by node
544  * (MPI_COMM_TYPE_SHARED) and then having only the node with world rank 0 and
545  * shared rank 0 return the MPI_Comm_size of its shared subgroup. This yields
546  * the original consistent behavior no matter which node asks.
547  *
548  * In the common case where every node has the same number of tasks this
549  * method will return the same value it always has.
550  */
551 int GetNumTasksOnNode0(MPI_Comm comm) {
552  if (getenv("IOR_FAKE_TASK_PER_NODES")){
553  int tasksPerNode = atoi(getenv("IOR_FAKE_TASK_PER_NODES"));
554  int rank;
555  MPI_Comm_rank(comm, & rank);
556  if(rank == 0){
557  printf("Fake tasks per node: using %d\n", tasksPerNode);
558  }
559  return tasksPerNode;
560  }
561 #if MPI_VERSION >= 3
562  MPI_Comm shared_comm;
563  int shared_rank = 0;
564  int tasks_on_node_rank0 = 0;
565  int local_result = 0;
566 
567  MPI_CHECK(MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shared_comm),
568  "MPI_Comm_split_type() error");
569  MPI_CHECK(MPI_Comm_rank(shared_comm, &shared_rank), "MPI_Comm_rank() error");
570  if (rank == 0 && shared_rank == 0) {
571  MPI_CHECK(MPI_Comm_size(shared_comm, &local_result), "MPI_Comm_size() error");
572  }
573  MPI_CHECK(MPI_Allreduce(&local_result, &tasks_on_node_rank0, 1, MPI_INT, MPI_SUM, comm),
574  "MPI_Allreduce() error");
575  MPI_CHECK(MPI_Comm_free(&shared_comm), "MPI_Comm_free() error");
576 
577  return tasks_on_node_rank0;
578 #else
579 /*
580  * This version employs the gethostname() call, rather than using
581  * MPI_Get_processor_name(). We are interested in knowing the number
582  * of tasks that share a file system client (I/O node, compute node,
583  * whatever that may be). However on machines like BlueGene/Q,
584  * MPI_Get_processor_name() uniquely identifies a cpu in a compute node,
585  * not the node where the I/O is function shipped to. gethostname()
586  * is assumed to identify the shared filesystem client in more situations.
587  */
588  int size;
589  MPI_Comm_size(comm, & size);
590  /* for debugging and testing */
591  char localhost[MAX_PATHLEN],
592  hostname[MAX_PATHLEN];
593  int count = 1,
594  i;
595  MPI_Status status;
596 
597  if (( rank == 0 ) && ( verbose >= 1 )) {
598  fprintf( out_logfile, "V-1: Entering count_tasks_per_node...\n" );
599  fflush( out_logfile );
600  }
601 
602  if (gethostname(localhost, MAX_PATHLEN) != 0) {
603  FAIL("gethostname()");
604  }
605  if (rank == 0) {
606  /* MPI_receive all hostnames, and compares them to the local hostname */
607  for (i = 0; i < size-1; i++) {
608  MPI_Recv(hostname, MAX_PATHLEN, MPI_CHAR, MPI_ANY_SOURCE,
609  MPI_ANY_TAG, comm, &status);
610  if (strcmp(hostname, localhost) == 0) {
611  count++;
612  }
613  }
614  } else {
615  /* MPI_send hostname to root node */
616  MPI_Send(localhost, MAX_PATHLEN, MPI_CHAR, 0, 0, comm);
617  }
618  MPI_Bcast(&count, 1, MPI_INT, 0, comm);
619 
620  return(count);
621 #endif
622 }
623 
624 
625 /*
626  * Extract key/value pair from hint string.
627  */
628 void ExtractHint(char *settingVal, char *valueVal, char *hintString)
629 {
630  char *settingPtr, *valuePtr, *tmpPtr2;
631 
632  /* find the value */
633  settingPtr = (char *)strtok(hintString, " =");
634  valuePtr = (char *)strtok(NULL, " =\t\r\n");
635  /* is this an MPI hint? */
636  tmpPtr2 = (char *) strstr(settingPtr, "IOR_HINT__MPI__");
637  if (settingPtr == tmpPtr2) {
638  settingPtr += strlen("IOR_HINT__MPI__");
639  } else {
640  tmpPtr2 = (char *) strstr(hintString, "IOR_HINT__GPFS__");
641  /* is it an GPFS hint? */
642  if (settingPtr == tmpPtr2) {
643  settingPtr += strlen("IOR_HINT__GPFS__");
644  }else{
645  fprintf(out_logfile, "WARNING: Unable to set unknown hint type (not implemented.)\n");
646  return;
647  }
648  }
649  strcpy(settingVal, settingPtr);
650  strcpy(valueVal, valuePtr);
651 }
652 
653 /*
654  * Set hints for MPIIO, HDF5, or NCMPI.
655  */
656 void SetHints(MPI_Info * mpiHints, char *hintsFileName)
657 {
658  char hintString[MAX_STR];
659  char settingVal[MAX_STR];
660  char valueVal[MAX_STR];
661  extern char **environ;
662  int i;
663  FILE *fd;
664 
665  /*
666  * This routine checks for hints from the environment and/or from the
667  * hints files. The hints are of the form:
668  * 'IOR_HINT__<layer>__<hint>=<value>', where <layer> is either 'MPI'
669  * or 'GPFS', <hint> is the full name of the hint to be set, and <value>
670  * is the hint value. E.g., 'setenv IOR_HINT__MPI__IBM_largeblock_io true'
671  * or 'IOR_HINT__GPFS__hint=value' in the hints file.
672  */
673  MPI_CHECK(MPI_Info_create(mpiHints), "cannot create info object");
674 
675  /* get hints from environment */
676  for (i = 0; environ[i] != NULL; i++) {
677  /* if this is an IOR_HINT, pass the hint to the info object */
678  if (strncmp(environ[i], "IOR_HINT", strlen("IOR_HINT")) == 0) {
679  strcpy(hintString, environ[i]);
680  ExtractHint(settingVal, valueVal, hintString);
681  MPI_CHECK(MPI_Info_set(*mpiHints, settingVal, valueVal),
682  "cannot set info object");
683  }
684  }
685 
686  /* get hints from hints file */
687  if (hintsFileName != NULL && strcmp(hintsFileName, "") != 0) {
688 
689  /* open the hint file */
690  fd = fopen(hintsFileName, "r");
691  if (fd == NULL) {
692  WARN("cannot open hints file");
693  } else {
694  /* iterate over hints file */
695  while (fgets(hintString, MAX_STR, fd) != NULL) {
696  if (strncmp
697  (hintString, "IOR_HINT",
698  strlen("IOR_HINT")) == 0) {
699  ExtractHint(settingVal, valueVal,
700  hintString);
701  MPI_CHECK(MPI_Info_set
702  (*mpiHints, settingVal,
703  valueVal),
704  "cannot set info object");
705  }
706  }
707  /* close the hints files */
708  if (fclose(fd) != 0)
709  ERR("cannot close hints file");
710  }
711  }
712 }
713 
714 /*
715  * Show all hints (key/value pairs) in an MPI_Info object.
716  */
717 void ShowHints(MPI_Info * mpiHints)
718 {
719  char key[MPI_MAX_INFO_VAL];
720  char value[MPI_MAX_INFO_VAL];
721  int flag, i, nkeys;
722 
723  MPI_CHECK(MPI_Info_get_nkeys(*mpiHints, &nkeys),
724  "cannot get info object keys");
725 
726  for (i = 0; i < nkeys; i++) {
727  MPI_CHECK(MPI_Info_get_nthkey(*mpiHints, i, key),
728  "cannot get info object key");
729  MPI_CHECK(MPI_Info_get(*mpiHints, key, MPI_MAX_INFO_VAL - 1,
730  value, &flag),
731  "cannot get info object value");
732  fprintf(out_logfile, "\t%s = %s\n", key, value);
733  }
734 }
735 
736 /*
737  * Takes a string of the form 64, 8m, 128k, 4g, etc. and converts to bytes.
738  */
740 {
741  IOR_offset_t size = 0;
742  char range;
743  int rc;
744 
745  rc = sscanf(size_str, "%lld%c", &size, &range);
746  if (rc == 2) {
747  switch ((int)range) {
748  case 'k':
749  case 'K':
750  size <<= 10;
751  break;
752  case 'm':
753  case 'M':
754  size <<= 20;
755  break;
756  case 'g':
757  case 'G':
758  size <<= 30;
759  break;
760  }
761  } else if (rc == 0) {
762  size = -1;
763  }
764  return (size);
765 }
766 
767 /*
768  * Displays size of file system and percent of data blocks and inodes used.
769  */
770 void ShowFileSystemSize(char * filename, const struct ior_aiori * backend, void * backend_options) // this might be converted to an AIORI call
771 {
772  ior_aiori_statfs_t stat;
773  if(! backend->statfs){
774  WARN("Backend doesn't implement statfs");
775  return;
776  }
777  int ret = backend->statfs(filename, & stat, backend_options);
778  if( ret != 0 ){
779  WARN("Backend returned error during statfs");
780  return;
781  }
782  long long int totalFileSystemSize;
783  long long int freeFileSystemSize;
784  long long int totalInodes;
785  long long int freeInodes;
786  double totalFileSystemSizeHR;
787  double usedFileSystemPercentage;
788  double usedInodePercentage;
789  char *fileSystemUnitStr;
790 
791  totalFileSystemSize = stat.f_blocks * stat.f_bsize;
792  freeFileSystemSize = stat.f_bfree * stat.f_bsize;
793  usedFileSystemPercentage = (1 - ((double)freeFileSystemSize / (double)totalFileSystemSize)) * 100;
794  totalFileSystemSizeHR = (double)totalFileSystemSize / (double)(1<<30);
795 
796  /* inodes */
797  totalInodes = stat.f_files;
798  freeInodes = stat.f_ffree;
799  usedInodePercentage = (1 - ((double)freeInodes / (double)totalInodes)) * 100;
800 
801  fileSystemUnitStr = "GiB";
802  if (totalFileSystemSizeHR > 1024) {
803  totalFileSystemSizeHR = (double)totalFileSystemSize / (double)((long long)1<<40);
804  fileSystemUnitStr = "TiB";
805  }
807  fprintf(out_resultfile, "%-20s: %s\n", "Path", filename);
808  fprintf(out_resultfile, "%-20s: %.1f %s Used FS: %2.1f%% ",
809  "FS", totalFileSystemSizeHR, fileSystemUnitStr,
810  usedFileSystemPercentage);
811  fprintf(out_resultfile, "Inodes: %.1f Mi Used Inodes: %2.1f%%\n",
812  (double)totalInodes / (double)(1<<20),
813  usedInodePercentage);
814  fflush(out_logfile);
815  }else if(outputFormat == OUTPUT_JSON){
816  fprintf(out_resultfile, " , \"Path\": \"%s\",", filename);
817  fprintf(out_resultfile, "\"Capacity\": \"%.1f %s\", \"Used Capacity\": \"%2.1f%%\",",
818  totalFileSystemSizeHR, fileSystemUnitStr,
819  usedFileSystemPercentage);
820  fprintf(out_resultfile, "\"Inodes\": \"%.1f Mi\", \"Used Inodes\" : \"%2.1f%%\"\n",
821  (double)totalInodes / (double)(1<<20),
822  usedInodePercentage);
823  }else if(outputFormat == OUTPUT_CSV){
824 
825  }
826 
827  return;
828 }
829 
/*
 * Return match of regular expression -- 0 is failure, 1 is success.
 *
 * pattern is compiled as a POSIX extended regular expression.  A pattern
 * that does not compile counts as "no match".  On _WIN32 the result is
 * always 0.
 */
int Regex(char *string, char *pattern)
{
        int retValue = 0;
#ifndef _WIN32                  /* Okay to always not match */
        regex_t regEx;

        /* regexec()/regfree() must not be used on a regex that failed to
           compile; only match/no-match is needed, hence REG_NOSUB */
        if (regcomp(&regEx, pattern, REG_EXTENDED | REG_NOSUB) != 0) {
                return 0;
        }
        if (regexec(&regEx, string, 0, NULL, 0) == 0) {
                retValue = 1;
        }
        regfree(&regEx);
#endif

        return (retValue);
}
849 
/*
 * System info for Windows: minimal uname() replacement.
 * Fills in the fully qualified DNS name as nodename and "Windows" as
 * sysname; release, version and machine are set to "-".  Returns 0.
 */
#ifdef _WIN32
int uname(struct utsname *name)
{
        DWORD hostLen = sizeof(name->nodename) - 1;

        /* zero everything first so the bounded copies stay terminated */
        memset(name, 0, sizeof(struct utsname));

        if (!GetComputerNameEx(ComputerNameDnsFullyQualified,
                               name->nodename, &hostLen))
                ERR("GetComputerNameEx failed");

        /* FIXME - these should be easy to fetch */
        strncpy(name->machine, "-", sizeof(name->machine) - 1);
        strncpy(name->version, "-", sizeof(name->version) - 1);
        strncpy(name->release, "-", sizeof(name->release) - 1);
        strncpy(name->sysname, "Windows", sizeof(name->sysname) - 1);
        return 0;
}
#endif /* _WIN32 */
871 
/*
 * Get time stamp: the current gettimeofday() time in seconds, including the
 * microsecond fraction.  ERR() is invoked if gettimeofday() fails.
 */
double GetTimeStamp(void)
{
        struct timeval now;

        if (gettimeofday(&now, (struct timezone *)NULL) != 0)
                ERR("cannot use gettimeofday()");

        const double seconds = (double)now.tv_sec;
        const double fraction = (double)now.tv_usec / 1000000;
        return seconds + fraction;
}
887 
888 /*
889  * Determine any spread (range) between node times.
890  * Obsolete
891  */
892 static double TimeDeviation(MPI_Comm com)
893 {
894  double timestamp;
895  double min = 0;
896  double max = 0;
897  double roottimestamp;
898 
899  MPI_CHECK(MPI_Barrier(com), "barrier error");
900  timestamp = GetTimeStamp();
901  MPI_CHECK(MPI_Reduce(&timestamp, &min, 1, MPI_DOUBLE,
902  MPI_MIN, 0, com),
903  "cannot reduce tasks' times");
904  MPI_CHECK(MPI_Reduce(&timestamp, &max, 1, MPI_DOUBLE,
905  MPI_MAX, 0, com),
906  "cannot reduce tasks' times");
907 
908  /* delta between individual nodes' time and root node's time */
909  roottimestamp = timestamp;
910  MPI_CHECK(MPI_Bcast(&roottimestamp, 1, MPI_DOUBLE, 0, com),
911  "cannot broadcast root's time");
912  // wall_clock_delta = timestamp - roottimestamp;
913 
914  return max - min;
915 }
916 
917 void init_clock(MPI_Comm com){
918 
919 }
920 
921 char * PrintTimestamp() {
922  static char datestring[80];
923  time_t cur_timestamp;
924 
925  if (( rank == 0 ) && ( verbose >= 1 )) {
926  fprintf( out_logfile, "V-1: Entering PrintTimestamp...\n" );
927  }
928 
929  fflush(out_logfile);
930  cur_timestamp = time(NULL);
931  strftime(datestring, 80, "%m/%d/%Y %T", localtime(&cur_timestamp));
932 
933  return datestring;
934 }
935 
936 int64_t ReadStoneWallingIterations(char * const filename, MPI_Comm com){
937  long long data;
938  if(rank != 0){
939  MPI_Bcast( & data, 1, MPI_LONG_LONG_INT, 0, com);
940  return data;
941  }else{
942  FILE * out = fopen(filename, "r");
943  if (out == NULL){
944  data = -1;
945  MPI_Bcast( & data, 1, MPI_LONG_LONG_INT, 0, com);
946  return data;
947  }
948  int ret = fscanf(out, "%lld", & data);
949  if (ret != 1){
950  fclose(out);
951  return -1;
952  }
953  fclose(out);
954  MPI_Bcast( & data, 1, MPI_LONG_LONG_INT, 0, com);
955  return data;
956  }
957 }
958 
959 void StoreStoneWallingIterations(char * const filename, int64_t count){
960  if(rank != 0){
961  return;
962  }
963  FILE * out = fopen(filename, "w");
964  if (out == NULL){
965  FAIL("Cannot write to the stonewalling file!");
966  }
967  fprintf(out, "%lld", (long long) count);
968  fclose(out);
969 }
970 
971 /*
972  * Sleep for 'delay' seconds.
973  */
974 void DelaySecs(int delay){
975  if (rank == 0 && delay > 0) {
976  if (verbose >= VERBOSE_1)
977  fprintf(out_logfile, "delaying %d seconds . . .\n", delay);
978  sleep(delay);
979  }
980 }
981 
982 
983 /*
984  * Convert IOR_offset_t value to human readable string. This routine uses a
985  * statically-allocated buffer internally and so is not re-entrant.
986  */
987 char *HumanReadable(IOR_offset_t value, int base)
988 {
989  static char valueStr[MAX_STR];
990  IOR_offset_t m = 0, g = 0, t = 0;
991  char m_str[8], g_str[8], t_str[8];
992 
993  if (base == BASE_TWO) {
994  m = MEBIBYTE;
995  g = GIBIBYTE;
996  t = GIBIBYTE * 1024llu;
997  strcpy(m_str, "MiB");
998  strcpy(g_str, "GiB");
999  strcpy(t_str, "TiB");
1000  } else if (base == BASE_TEN) {
1001  m = MEGABYTE;
1002  g = GIGABYTE;
1003  t = GIGABYTE * 1000llu;
1004  strcpy(m_str, "MB");
1005  strcpy(g_str, "GB");
1006  strcpy(t_str, "TB");
1007  }
1008 
1009  if (value >= t) {
1010  if (value % t) {
1011  snprintf(valueStr, MAX_STR-1, "%.2f %s",
1012  (double)((double)value / t), t_str);
1013  } else {
1014  snprintf(valueStr, MAX_STR-1, "%d %s", (int)(value / t), t_str);
1015  }
1016  }else if (value >= g) {
1017  if (value % g) {
1018  snprintf(valueStr, MAX_STR-1, "%.2f %s",
1019  (double)((double)value / g), g_str);
1020  } else {
1021  snprintf(valueStr, MAX_STR-1, "%d %s", (int)(value / g), g_str);
1022  }
1023  } else if (value >= m) {
1024  if (value % m) {
1025  snprintf(valueStr, MAX_STR-1, "%.2f %s",
1026  (double)((double)value / m), m_str);
1027  } else {
1028  snprintf(valueStr, MAX_STR-1, "%d %s", (int)(value / m), m_str);
1029  }
1030  } else if (value >= 0) {
1031  snprintf(valueStr, MAX_STR-1, "%d bytes", (int)value);
1032  } else {
1033  snprintf(valueStr, MAX_STR-1, "-");
1034  }
1035  return valueStr;
1036 }
1037 
/*
 * Report the chip and core the caller is running on.  Three build variants:
 *  - HAVE_GETCPU_SYSCALL: returns the result of the getcpu system call
 *  - HAVE_RDTSCP_ASM:     decodes chip/core from the value rdtscp leaves in
 *                         ECX and returns the 64-bit time stamp counter
 *  - otherwise:           dummy that reports chip 0 / core 0 and returns 1
 */
#if defined(HAVE_GETCPU_SYSCALL)
// Assume we aren't worried about thread/process migration.
// Test on Intel systems and see if we can get rid of the architecture specificity
// of the code.
unsigned long GetProcessorAndCore(int *chip, int *core){
        return syscall(SYS_getcpu, core, chip, NULL);
}
#elif defined(HAVE_RDTSCP_ASM)
// We're on an intel processor and use the
// rdtscp instruction.
unsigned long GetProcessorAndCore(int *chip, int *core){
        unsigned long lo, hi, aux;

        __asm__ volatile("rdtscp" : "=a" (lo), "=d" (hi), "=c" (aux));
        *core = aux & 0xFFF;
        *chip = (aux & 0xFFF000)>>12;
        return ((unsigned long)lo) | (((unsigned long)hi) << 32);
}
#else
// TODO: Add in AMD function
unsigned long GetProcessorAndCore(int *chip, int *core){
#warning GetProcessorAndCore is implemented as a dummy
        *core = 0;
        *chip = 0;
        return 1;
}
#endif
1064 
1065 
1066 
1067 /*
1068  * Allocate a page-aligned (required by O_DIRECT) buffer.
1069  */
1070 void *aligned_buffer_alloc(size_t size, ior_memory_flags type)
1071 {
1072  size_t pageMask;
1073  char *buf, *tmp;
1074  char *aligned;
1075 
1077 #ifdef HAVE_CUDA
1078  // use unified memory here to allow drop-in-replacement
1079  if (cudaMallocManaged((void**) & buf, size, cudaMemAttachGlobal) != cudaSuccess){
1080  ERR("Cannot allocate buffer on GPU");
1081  }
1082  return buf;
1083 #else
1084  ERR("No CUDA supported, cannot allocate on the GPU");
1085 #endif
1086  }else if(type == IOR_MEMORY_TYPE_GPU_DEVICE_ONLY){
1087 #ifdef HAVE_GPU_DIRECT
1088  if (cudaMalloc((void**) & buf, size) != cudaSuccess){
1089  ERR("Cannot allocate buffer on GPU");
1090  }
1091  return buf;
1092 #else
1093  ERR("No GPUDirect supported, cannot allocate on the GPU");
1094 #endif
1095  }
1096 
1097 #ifdef HAVE_SYSCONF
1098  long pageSize = sysconf(_SC_PAGESIZE);
1099 #else
1100  size_t pageSize = getpagesize();
1101 #endif
1102 
1103  pageMask = pageSize - 1;
1104  buf = safeMalloc(size + pageSize + sizeof(void *));
1105  /* find the alinged buffer */
1106  tmp = buf + sizeof(char *);
1107  aligned = tmp + pageSize - ((size_t) tmp & pageMask);
1108  /* write a pointer to the original malloc()ed buffer into the bytes
1109  preceding "aligned", so that the aligned buffer can later be free()ed */
1110  tmp = aligned - sizeof(void *);
1111  *(void **)tmp = buf;
1112 
1113  return (void *)aligned;
1114 }
1115 
1116 /*
1117  * Free a buffer allocated by aligned_buffer_alloc().
1118  */
1120 {
1121  if(gpu){
1122 #ifdef HAVE_CUDA
1123  if (cudaFree(buf) != cudaSuccess){
1124  WARN("Cannot free buffer on GPU");
1125  }
1126  return;
1127 #else
1128  ERR("No CUDA supported, cannot free on the GPU");
1129 #endif
1130  }
1131  free(*(void **)((char *)buf - sizeof(char *)));
1132 }
int verify_memory_pattern_gpu(uint64_t item, char *buffer, size_t bytes, int rand_seed, int pretendRank, ior_dataPacketType_e dataPacketType)
void invalidate_buffer_pattern(char *buffer, size_t bytes, ior_memory_flags type)
Definition: utilities.c:172
char * HumanReadable(IOR_offset_t value, int base)
Definition: utilities.c:987
#define ERRF(FORMAT,...)
Definition: aiori-debug.h:67
MPI_Comm testComm
Definition: utilities.c:73
int GetNumTasks(MPI_Comm comm)
Definition: utilities.c:513
#define MEBIBYTE
Definition: iordef.h:92
uint64_t f_blocks
Definition: aiori.h:53
unsigned long GetProcessorAndCore(int *chip, int *core)
Definition: utilities.c:1057
uint64_t f_bfree
Definition: aiori.h:54
void ShowHints(MPI_Info *mpiHints)
Definition: utilities.c:717
unsigned int incompressibleSeed
Definition: ior.h:134
#define VERBOSE_0
Definition: iordef.h:106
void * airoi_update_module_options(const ior_aiori_t *backend, options_all_t *opt)
Definition: aiori.c:96
CURLcode rc
Definition: aiori-S3-4c.c:111
char *(* get_version)(void)
Definition: aiori.h:101
#define RANDALGO_GOLDEN_RATIO_PRIME
Definition: utilities.c:62
int rankOffset
Definition: utilities.c:71
OutputFormat_t
Definition: iordef.h:69
int64_t ReadStoneWallingIterations(char *const filename, MPI_Comm com)
Definition: utilities.c:936
enum OutputFormat_t outputFormat
Definition: utilities.c:76
int(* statfs)(const char *, ior_aiori_statfs_t *, aiori_mod_opt_t *module_options)
Definition: aiori.h:104
size_t memoryPerNode
Definition: ior.h:137
#define min(a, b)
Definition: md-workbench.c:27
int QueryNodeMapping(MPI_Comm comm, int print_nodemap)
Definition: utilities.c:402
uint64_t f_ffree
Definition: aiori.h:57
#define FAIL(...)
Definition: aiori-debug.h:16
char * apiVersion
Definition: ior.h:77
int numTasks
int setTimeStampSignature
Definition: ior.h:131
IOR_offset_t StringToBytes(char *size_str)
Definition: utilities.c:739
#define GIBIBYTE
Definition: iordef.h:93
#define MPI_CHECK(MPI_STATUS, MSG)
Definition: aiori-debug.h:97
int verbose
Definition: utilities.c:72
void * backend_options
Definition: ior.h:143
char * PrintTimestamp()
Definition: utilities.c:921
const ior_aiori_t * aiori_select(const char *api)
Definition: aiori.c:240
ior_dataPacketType_e dataPacketType
Definition: ior.h:141
ior_dataPacketType_e parsePacketType(char t)
Definition: utilities.c:291
uint64_t f_files
Definition: aiori.h:56
static double TimeDeviation(MPI_Comm com)
Definition: utilities.c:892
static option_help options[]
Definition: aiori-CEPHFS.c:59
uint64_t f_bsize
Definition: aiori.h:52
void init_clock(MPI_Comm com)
Definition: utilities.c:917
char * CurrentTimeString(void)
Definition: utilities.c:356
#define WARN(MSG)
Definition: aiori-debug.h:45
void updateParsedOptions(IOR_param_t *options, options_all_t *global_options)
Definition: utilities.c:308
int GetNumNodes(MPI_Comm comm)
Definition: utilities.c:476
void initCUDA(int blockMapping, int rank, int numNodes, int tasksPerNode, int useGPUID)
Definition: utilities.c:438
int rank
Definition: utilities.c:70
FILE * out_resultfile
Definition: utilities.c:75
double GetTimeStamp(void)
Definition: utilities.c:876
static const ior_aiori_t * backend
Definition: ior.c:61
char ** environ
void StoreStoneWallingIterations(char *const filename, int64_t count)
Definition: utilities.c:959
static options_all_t * global_options
Definition: parse_options.c:41
void update_write_memory_pattern(uint64_t item, char *buf, size_t bytes, int rand_seed, int pretendRank, ior_dataPacketType_e dataPacketType, ior_memory_flags type)
Definition: utilities.c:94
#define WARNF(FORMAT,...)
Definition: aiori-debug.h:30
#define GIGABYTE
Definition: iordef.h:89
long long int IOR_size_t
Definition: iordef.h:124
char * buffer_type
Definition: ior.h:140
void generate_memory_pattern(char *buf, size_t bytes, int rand_seed, int pretendRank, ior_dataPacketType_e dataPacketType, ior_memory_flags type)
Definition: utilities.c:137
int64_t string_to_bytes(char *size_str)
Definition: option.c:30
#define BASE_TWO
Definition: iordef.h:96
#define MAX_STR
Definition: iordef.h:113
void ExtractHint(char *settingVal, char *valueVal, char *hintString)
Definition: utilities.c:628
#define MAX_PATHLEN
Definition: utilities.h:31
#define O_DIRECT
void generate_memory_pattern_gpu(char *buf, size_t bytes, int rand_seed, int rank, ior_dataPacketType_e dataPacketType)
void ShowFileSystemSize(char *filename, const struct ior_aiori *backend, void *backend_options)
Definition: utilities.c:770
int errno
const struct ior_aiori * backend
Definition: ior.h:73
void SetHints(MPI_Info *mpiHints, char *hintsFileName)
Definition: utilities.c:656
void set_o_direct_flag(int *flag)
Definition: utilities.c:332
#define MEGABYTE
Definition: iordef.h:88
void FailMessage(int rank, const char *location, char *format,...)
Definition: utilities.c:247
#define ERR(MSG)
Definition: aiori-debug.h:75
#define BASE_TEN
Definition: iordef.h:97
int verify_memory_pattern(uint64_t item, char *buffer, size_t bytes, int rand_seed, int pretendRank, ior_dataPacketType_e dataPacketType, ior_memory_flags type)
Definition: utilities.c:182
void DelaySecs(int delay)
Definition: utilities.c:974
#define VERBOSE_1
Definition: iordef.h:107
int Regex(char *string, char *pattern)
Definition: utilities.c:833
char * api
Definition: ior.h:76
size_t NodeMemoryStringToBytes(char *size_str)
Definition: utilities.c:259
ior_dataPacketType_e
Definition: iordef.h:22
void aligned_buffer_free(void *buf, ior_memory_flags gpu)
Definition: utilities.c:1119
ior_memory_flags
Definition: iordef.h:29
#define IOR_format
Definition: iordef.h:126
void update_write_memory_pattern_gpu(uint64_t item, char *buf, size_t bytes, int rand_seed, int rank, ior_dataPacketType_e dataPacketType)
void DumpBuffer(void *buffer, size_t size)
Definition: utilities.c:381
char * memoryPerNodeStr
Definition: ior.h:138
FILE * out_logfile
Definition: utilities.c:74
long long int IOR_offset_t
Definition: iordef.h:123
int GetNumTasksOnNode0(MPI_Comm comm)
Definition: utilities.c:551
void * safeMalloc(uint64_t size)
Definition: utilities.c:238
#define NULL
Definition: iordef.h:84
void * aligned_buffer_alloc(size_t size, ior_memory_flags type)
Definition: utilities.c:1070