1 #define _POSIX_C_SOURCE 199309L 23 #define DIRMODE S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IWGRP|S_IXGRP|S_IROTH|S_IXOTH 25 #define CHECK_MPI_RET(ret) if (ret != MPI_SUCCESS){ printf("Unexpected error in MPI on Line %d\n", __LINE__);} 26 #define LLU (long long unsigned) 27 #define min(a,b) (a < b ? a : b) 29 #define oprintf(...) do { fprintf(o.logfile, __VA_ARGS__); fflush(o.logfile); } while(0); 136 sprintf(out_name,
"%s/%d_%d",
o.
prefix, n, d);
140 sprintf(out_name,
"%s/%d_%d/file-%d",
o.
prefix, n, d, i);
154 .packetTypeStr =
"t",
155 .run_info_file =
"md-workbench.status",
167 double end = cur + waittime;
173 w.tv_sec = (time_t) (waittime);
174 w.tv_nsec = (long) ((waittime - w.tv_sec) * 1000 * 1000 * 1000);
175 nanosleep(& w,
NULL);
190 float curtime = start - phase_start_timer;
192 results[pos].
runtime = (float) op_time;
194 if (op_time > *max_time){
197 *out_op_time = op_time;
202 printf(
"phase\t\td name\tcreate\tdelete\tob nam\tcreate\tread\tstat\tdelete\tt_inc_b\tt_no_bar\tthp\tmax_t\n");
211 for(
int i=0; i <
o.
size; i++){
220 for(
int i=0; i <
o.
size; i++){
221 sum += (mean - arr[i])*(mean - arr[i]);
223 return sqrt(sum / (
o.
size-1));
229 for(
int i=0; i <
o.
size; i++){
230 min = (arr[i] <
min) ? arr[i] : min;
231 max = (arr[i] > max) ? arr[i] : max;
254 sprintf(buff,
"%s \t%d\t%d\t%d\t%d\t%d\t%d\t%.3fs\t%.3fs\t%.2f MiB/s %.4e", name, p->
dset_create.
suc, p->
dset_delete.
suc, p->
obj_create.
suc, p->
obj_read.
suc, p->
obj_stat.
suc, p->
obj_delete.
suc, p->
t, t, tp, p->
max_op_time);
262 pos += sprintf(buff,
"%s process max:%.2fs ", name, t);
264 pos += sprintf(buff + pos,
"min:%.2fs mean: %.2fs balance:%.1f stddev:%.1f ", r_min, r_mean, r_min/r_max * 100.0, r_std);
266 int ioops_per_iter = 4;
276 pos += sprintf(buff + pos,
"rate:%.1f iops/s objects:%d rate:%.1f obj/s tp:%.1f MiB/s op-max:%.4es",
289 pos += sprintf(buff + pos,
"rate:%.1f iops/s dsets: %d objects:%d rate:%.3f dset/s rate:%.1f obj/s tp:%.1f MiB/s op-max:%.4es",
300 pos += sprintf(buff + pos,
"rate:%.1f iops/s objects:%d dsets: %d rate:%.1f obj/s rate:%.3f dset/s op-max:%.4es",
309 pos = sprintf(buff,
"%s: unknown phase", name);
324 pos += sprintf(buff + pos,
" (%d errs", errs);
326 pos += sprintf(buff + pos,
"!!!)" );
328 pos += sprintf(buff + pos,
")" );
337 pos += sprintf(buff + pos,
" read(%.4es, %.4es, %.4es, %.4es, %.4es, %.4es, %.4es)", stat.
min, stat.
q1, stat.
median, stat.
q3, stat.
q90, stat.
q99, stat.
max);
341 pos += sprintf(buff + pos,
" stat(%.4es, %.4es, %.4es, %.4es, %.4es, %.4es, %.4es)", stat.
min, stat.
q1, stat.
median, stat.
q3, stat.
q90, stat.
q99, stat.
max);
345 pos += sprintf(buff + pos,
" create(%.4es, %.4es, %.4es, %.4es, %.4es, %.4es, %.4es)", stat.
min, stat.
q1, stat.
median, stat.
q3, stat.
q90, stat.
q99, stat.
max);
349 pos += sprintf(buff + pos,
" delete(%.4es, %.4es, %.4es, %.4es, %.4es, %.4es, %.4es)", stat.
min, stat.
q1, stat.
median, stat.
q3, stat.
q90, stat.
q99, stat.
max);
359 int pos = round(quantile * (repeats - 1) + 0.49);
360 assert(pos < repeats);
370 memcpy(global_times, times, repeats * 2 *
sizeof(
float));
372 for(
int i=1; i <
o.
size; i++){
374 ret = MPI_Recv(& global_times[count], max_repeats*2, MPI_FLOAT, i, 888,
o.
com, & status);
376 MPI_Get_count(& status, MPI_FLOAT, & cnt);
380 ret = MPI_Send(times, repeats * 2, MPI_FLOAT, 0, 888,
o.
com);
391 FILE * f = fopen(file,
"w+");
393 ERRF(
"%d: Error writing to latency file: %s",
o.
rank, file);
396 fprintf(f,
"time,runtime\n");
397 for(
size_t i = 0; i < repeats; i++){
398 fprintf(f,
"%.7f,%.4e\n", times[i].time_since_app_start, times[i].runtime);
406 if(repeats % 2 == 0){
425 if(strcmp(name,
"benchmark") == 0){
433 ret = MPI_Reduce(& p->
t, & g_stat.
t, 2, MPI_DOUBLE, MPI_MAX, 0,
o.
com);
436 g_stat.
t_all = (
double*) malloc(
sizeof(
double) *
o.
size);
438 ret = MPI_Gather(& p->
t, 1, MPI_DOUBLE, g_stat.
t_all, 1, MPI_DOUBLE, 0,
o.
com);
451 if(strcmp(name,
"precreate") == 0){
457 }
else if(strcmp(name,
"cleanup") == 0){
463 }
else if(strcmp(name,
"benchmark") == 0){
501 for(
int i=1; i <
o.
size; i++){
502 MPI_Recv(buff,
MAX_PATHLEN, MPI_CHAR, i, 4711,
o.
com, MPI_STATUS_IGNORE);
558 ERRF(
"%d: Error while creating the dset: %s",
o.
rank, dset);
570 for(
int f=current_index; f <
o.
precreate; f++){
577 if (
NULL == aiori_fh){
578 FAIL(
"Unable to open file %s", obj_name);
586 ERRF(
"%d: Error while creating the obj: %s",
o.
rank, obj_name);
594 oprintf(
"%d: write %s:%s (%d) pretend: %d\n",
o.
rank, dset, obj_name, ret,
o.
rank);
609 int start_index = *current_index_p;
610 int total_num =
o.
num;
613 double phase_allreduce_time = 0;
616 for(f=0; f < total_num; f++){
617 float bench_runtime = 0;
620 struct stat stat_buf;
621 const int prevFile = f + start_index;
625 readRank = readRank < 0 ? readRank +
o.
size : readRank;
644 ERRF(
"%d: Error while stating the obj: %s",
o.
rank, obj_name);
651 oprintf(
"%d: read %s pretend: %d\n",
o.
rank, obj_name, readRank);
656 if (
NULL == aiori_fh){
657 FAIL(
"Unable to open file %s", obj_name);
671 WARNF(
"%d: Error while reading the obj: %s",
o.
rank, obj_name);
696 const int newFileIndex =
o.
precreate + prevFile;
701 if (
NULL != aiori_fh){
710 ERRF(
"%d: Error while creating the obj: %s\n",
o.
rank, obj_name);
716 ERRF(
"%d: Error while creating the obj: %s",
o.
rank, obj_name);
718 WARNF(
"Unable to open file %s", obj_name);
727 oprintf(
"%d: write %s (%d) pretend: %d\n",
o.
rank, obj_name, ret, writeRank);
739 armed_stone_wall = 0;
743 int ret = MPI_Allreduce(& cur_pos, & total_num, 1, MPI_INT, MPI_MAX,
o.
com);
748 oprintf(
"stonewall wear out %fs (%d iter)\n", bench_runtime, total_num);
758 int ret = MPI_Allreduce(& f, & total_num, 1, MPI_INT, MPI_MAX,
o.
com);
770 *current_index_p += f;
802 oprintf(
"Unable to remove directory %s\n", dset);
812 {
'O',
"offset",
"Offset in o.ranks between writers and readers. Writers and readers should be located on different nodes.",
OPTION_OPTIONAL_ARGUMENT,
'd', &
o.
offset},
831 {
'w',
"stonewall-timer",
"Stop each benchmark iteration after the specified seconds (if not used with -W this leads to process-specific progress!)",
OPTION_OPTIONAL_ARGUMENT,
'd', &
o.
stonewall_timer},
832 {
'W',
"stonewall-wear-out",
"Stop with stonewall after specified time and use a soft wear-out phase -- all processes perform the same number of iterations",
OPTION_FLAG,
'd', &
o.
stonewall_timer_wear_out},
836 {0,
"allocateBufferOnGPU",
"Allocate I/O buffers on the GPU: X=1 uses managed memory - verifications are run on CPU; X=2 managed memory - verifications on GPU; X=3 device memory with verifications on GPU.",
OPTION_OPTIONAL_ARGUMENT,
'd', &
o.
gpuMemoryFlags},
838 #ifdef HAVE_GPU_DIRECT 839 {0,
"gpuDirect",
"Allocate I/O buffers on the GPU and use gpuDirect to store data; this option is incompatible with any option requiring CPU access to data.",
OPTION_FLAG,
'd', &
o.
gpuDirect},
844 {0,
"read-only",
"Run read-only during benchmarking phase (no deletes/writes), probably use with -2",
OPTION_FLAG,
'd', &
o.
read_only},
854 time_t now = time(0);
855 strftime (buff, 100,
"%Y-%m-%d %H:%M:%S", localtime (&now));
867 ret = fscanf(f,
"pos: %d", & position);
874 ret = MPI_Bcast( & position, 1, MPI_INT, 0,
o.
com );
887 fprintf(f,
"pos: %d\n", position);
894 char * limit_memory_P =
NULL;
906 for(
int i=1; i < argc; i++){
919 ERR(
"Unrecognized I/O API");
922 ERR(
"Backend doesn't support MDWorbench");
934 WARN(
"Dangerous option combination: and benchmark phase (-2) using with stonewall option (-w) without stonewall wear-out will lead to files that cannot be cleaned up using the cleanup phase(-3). Also multiple iterations are problematic.");
959 int current_index = 0;
974 oprintf(
"WARNING: num > precreate, this may cause the situation that no objects are available to read\n");
991 double t_bench_start;
1033 for(
int r=0; r <= 6; r++){
1067 oprintf(
"Total runtime: %.0fs time: ", t_all);
void invalidate_buffer_pattern(char *buffer, size_t bytes, ior_memory_flags type)
mdworkbench_results_t * md_workbench_run(int argc, char **argv, MPI_Comm world_com, FILE *out_logfile)
void run_benchmark(phase_stat_t *s, int *current_index_p)
static void def_dset_name(char *out_name, int n, int d)
time_statistics_t stats_read
time_statistics_t stats_create
static int sum_err(phase_stat_t *p)
static float add_timed_result(double start, double phase_start_timer, time_result_t *results, size_t pos, double *max_time, double *out_op_time)
void * airoi_update_module_options(const ior_aiori_t *backend, options_all_t *opt)
int option_parse(int argc, char **argv, options_all_t *opt_all)
float relative_waiting_factor
struct benchmark_options o
time_statistics_t stats_delete
static void statistics_minmax(int count, double *arr, double *out_min, double *out_max)
time_result_t * time_create
void run_precreate(phase_stat_t *s, int current_index)
void(* delete)(char *, aiori_mod_opt_t *module_options)
int(* mkdir)(const char *path, mode_t mode, aiori_mod_opt_t *module_options)
int QueryNodeMapping(MPI_Comm comm, int print_nodemap)
static uint64_t aggregate_timers(int repeats, int max_repeats, time_result_t *times, time_result_t *global_times)
time_statistics_t stats_stat
static void compute_histogram(const char *name, time_result_t *times, time_statistics_t *stats, size_t repeats, int writeLatencyFile)
int stonewall_timer_wear_out
ior_dataPacketType_e dataPacketType
static option_help options[]
static int compare_floats(time_result_t *x, time_result_t *y)
int(* rmdir)(const char *path, aiori_mod_opt_t *module_options)
static double statistics_mean(int count, double *arr)
const ior_aiori_t * aiori_select(const char *api)
time_result_t * time_stat
ior_dataPacketType_e parsePacketType(char t)
int(* check_params)(aiori_mod_opt_t *)
static double statistics_std_dev(int count, double *arr)
void run_cleanup(phase_stat_t *s, int start_index)
void init_clock(MPI_Comm com)
void(* initialize)(aiori_mod_opt_t *options)
ior_memory_flags gpuMemoryFlags
float time_since_app_start
int GetNumNodes(MPI_Comm comm)
void initCUDA(int blockMapping, int rank, int numNodes, int tasksPerNode, int useGPUID)
time_statistics_t stats_create
void(* xfer_hints)(aiori_xfer_hint_t *params)
void(* close)(aiori_fd_t *, aiori_mod_opt_t *module_options)
time_result_t * time_read
int(* stat)(const char *path, struct stat *buf, aiori_mod_opt_t *module_options)
mdworkbench_results_t * results
options_all_t * airoi_create_all_module_options(option_help *global_options)
double GetTimeStamp(void)
static int return_position()
int adaptive_waiting_mode
static void store_position(int position)
#define CHECK_MPI_RET(ret)
ior_aiori_t const * backend
aiori_fd_t *(* create)(char *, int iorflags, aiori_mod_opt_t *)
IOR_offset_t(* xfer)(int access, aiori_fd_t *, IOR_size_t *, IOR_offset_t size, IOR_offset_t offset, aiori_mod_opt_t *module_options)
static void def_obj_name(char *out_name, int n, int d, int i)
static void end_phase(const char *name, phase_stat_t *p)
static options_all_t * global_options
void update_write_memory_pattern(uint64_t item, char *buf, size_t bytes, int rand_seed, int pretendRank, ior_dataPacketType_e dataPacketType, ior_memory_flags type)
time_statistics_t stats_delete
time_result_t * time_delete
#define WARNF(FORMAT,...)
mdworkbench_result_t result[]
void(* finalize)(aiori_mod_opt_t *options)
static double runtime_quantile(int repeats, time_result_t *times, float quantile)
void generate_memory_pattern(char *buf, size_t bytes, int rand_seed, int pretendRank, ior_dataPacketType_e dataPacketType, ior_memory_flags type)
static void print_detailed_stat_header()
static void mdw_wait(double runtime)
time_statistics_t stats_stat
aiori_fd_t *(* open)(char *, int iorflags, aiori_mod_opt_t *)
int ignore_precreate_errors
int verify_memory_pattern(uint64_t item, char *buffer, size_t bytes, int rand_seed, int pretendRank, ior_dataPacketType_e dataPacketType, ior_memory_flags type)
static void print_p_stat(char *buff, const char *name, phase_stat_t *p, double t, int print_global)
void aligned_buffer_free(void *buf, ior_memory_flags gpu)
uint64_t start_item_number
static void init_stats(phase_stat_t *p, size_t repeats)
int GetNumTasksOnNode0(MPI_Comm comm)
time_statistics_t stats_read
char * latency_file_prefix
void * aligned_buffer_alloc(size_t size, ior_memory_flags type)