IOR
aiori-POSIX.c
Go to the documentation of this file.
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  */
4 /******************************************************************************\
5 * *
6 * Copyright (c) 2003, The Regents of the University of California *
7 * See the file COPYRIGHT for a complete copyright notice and license. *
8 * *
9 ********************************************************************************
10 *
11 * Implementation of the abstract I/O interface for POSIX.
12 *
13 \******************************************************************************/
14 
15 #ifdef HAVE_CONFIG_H
16 # include "config.h"
17 #endif
18 
19 #include <stdio.h>
20 #include <stdlib.h>
21 
22 #ifdef __linux__
23 # include <sys/ioctl.h> /* necessary for: */
24 # define __USE_GNU /* O_DIRECT and */
25 # include <fcntl.h> /* IO operations */
26 # undef __USE_GNU
27 #endif /* __linux__ */
28 
29 #include <errno.h>
30 #include <unistd.h>
31 #include <fcntl.h> /* IO operations */
32 #include <sys/stat.h>
33 #include <assert.h>
34 
35 
36 #ifdef HAVE_LUSTRE_USER
37 # ifdef HAVE_LINUX_LUSTRE_LUSTRE_USER_H
38 # include <linux/lustre/lustre_user.h>
39 # elif defined(HAVE_LUSTRE_LUSTRE_USER_H)
40 # include <lustre/lustre_user.h>
41 # endif
42 #endif /* HAVE_LUSTRE_USER */
43 
44 #ifdef HAVE_GPFS_H
45 # include <gpfs.h>
46 #endif
47 #ifdef HAVE_GPFS_FCNTL_H
48 # include <gpfs_fcntl.h>
49 #endif
50 
51 #ifdef HAVE_BEEGFS_BEEGFS_H
52 # include <beegfs/beegfs.h>
53 # include <dirent.h>
54 # include <libgen.h>
55 #endif
56 
57 #include "ior.h"
58 #include "aiori.h"
59 #include "iordef.h"
60 #include "utilities.h"
61 
62 #include "aiori-POSIX.h"
63 
64 #ifdef HAVE_GPU_DIRECT
65 typedef long long loff_t;
66 # include <cuda_runtime.h>
67 # include <cufile.h>
68 #endif
69 
/*
 * Per-file state of the POSIX backend. POSIX_Create() and POSIX_Open()
 * allocate one of these and hand it out cast to aiori_fd_t*.
 */
typedef struct {
        int fd;                         /* descriptor obtained from open64() */
#ifdef HAVE_GPU_DIRECT
        CUfileHandle_t cf_handle;       /* filled in by init_cufile() through cuFileHandleRegister() */
#endif
} posix_fd;
76 
77 
78 #ifndef open64 /* necessary for TRU64 -- */
79 # define open64 open /* unlikely, but may pose */
80 #endif /* not open64 */ /* conflicting prototypes */
81 
82 #ifndef lseek64 /* necessary for TRU64 -- */
83 # define lseek64 lseek /* unlikely, but may pose */
84 #endif /* not lseek64 */ /* conflicting prototypes */
85 
86 #ifndef O_BINARY /* Required on Windows */
87 # define O_BINARY 0
88 #endif
89 
90 #ifdef HAVE_GPU_DIRECT
91 static const char* cuFileGetErrorString(CUfileError_t status){
92  if(IS_CUDA_ERR(status)){
93  return cudaGetErrorString(status.err);
94  }
95  return strerror(status.err);
96 }
97 
98 static void init_cufile(posix_fd * pfd){
99  CUfileDescr_t cf_descr = (CUfileDescr_t){
100  .handle.fd = pfd->fd,
101  .type = CU_FILE_HANDLE_TYPE_OPAQUE_FD
102  };
103  CUfileError_t status = cuFileHandleRegister(& pfd->cf_handle, & cf_descr);
104  if(status.err != CU_FILE_SUCCESS){
105  WARNF("Could not register handle %s", cuFileGetErrorString(status));
106  }
107 }
108 #endif
109 
110 /**************************** P R O T O T Y P E S *****************************/
112 static void POSIX_Finalize(aiori_mod_opt_t * options);
113 
116 
117 option_help * POSIX_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_opt_t * init_values){
118  posix_options_t * o = malloc(sizeof(posix_options_t));
119 
120  if (init_values != NULL){
121  memcpy(o, init_values, sizeof(posix_options_t));
122  }else{
123  memset(o, 0, sizeof(posix_options_t));
124  o->direct_io = 0;
125  o->lustre_stripe_count = -1;
126  o->lustre_start_ost = -1;
127  o->beegfs_numTargets = -1;
128  o->beegfs_chunkSize = -1;
129  }
130 
131  *init_backend_options = (aiori_mod_opt_t*) o;
132 
133  option_help h [] = {
134  {0, "posix.odirect", "Direct I/O Mode", OPTION_FLAG, 'd', & o->direct_io},
135  {0, "posix.rangelocks", "Use range locks (read locks for read ops)", OPTION_FLAG, 'd', & o->range_locks},
136 #ifdef HAVE_BEEGFS_BEEGFS_H
137  {0, "posix.beegfs.NumTargets", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->beegfs_numTargets},
138  {0, "posix.beegfs.ChunkSize", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->beegfs_chunkSize},
139 #endif
140 #ifdef HAVE_GPFS_FCNTL_H
141  {0, "posix.gpfs.hintaccess", "", OPTION_FLAG, 'd', & o->gpfs_hint_access},
142  {0, "posix.gpfs.releasetoken", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->gpfs_release_token},
143 #ifdef HAVE_GPFSFINEGRAINWRITESHARING_T
144  {0, "posix.gpfs.finegrainwritesharing", " Enable fine grain write sharing", OPTION_FLAG, 'd', & o->gpfs_finegrain_writesharing},
145  {0, "posix.gpfs.finegrainreadsharing", " Enable fine grain read sharing", OPTION_FLAG, 'd', & o->gpfs_finegrain_readsharing},
146 #endif
147 #ifdef HAVE_GPFSCREATESHARING_T
148  {0, "posix.gpfs.createsharing", " Enable efficient file creation in a shared directory", OPTION_FLAG, 'd', & o->gpfs_createsharing},
149 #endif
150 #endif // HAVE_GPFS_FCNTL_H
151 #ifdef HAVE_LUSTRE_USER
152  {0, "posix.lustre.stripecount", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->lustre_stripe_count},
153  {0, "posix.lustre.stripesize", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->lustre_stripe_size},
154  {0, "posix.lustre.startost", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->lustre_start_ost},
155  {0, "posix.lustre.ignorelocks", "", OPTION_FLAG, 'd', & o->lustre_ignore_locks},
156 #endif /* HAVE_LUSTRE_USER */
157 #ifdef HAVE_GPU_DIRECT
158  {0, "gpuDirect", "allocate I/O buffers on the GPU", OPTION_FLAG, 'd', & o->gpuDirect},
159 #endif
161  };
162  option_help * help = malloc(sizeof(h));
163  memcpy(help, h, sizeof(h));
164  return help;
165 }
166 
167 
168 /************************** D E C L A R A T I O N S ***************************/
169 
170 
172  .name = "POSIX",
173  .name_legacy = NULL,
174  .initialize = POSIX_Initialize,
175  .finalize = POSIX_Finalize,
176  .create = POSIX_Create,
177  .mknod = POSIX_Mknod,
178  .open = POSIX_Open,
179  .xfer = POSIX_Xfer,
180  .close = POSIX_Close,
181  .delete = POSIX_Delete,
182  .xfer_hints = POSIX_xfer_hints,
183  .get_version = aiori_get_version,
184  .fsync = POSIX_Fsync,
185  .get_file_size = POSIX_GetFileSize,
186  .statfs = aiori_posix_statfs,
187  .mkdir = aiori_posix_mkdir,
188  .rmdir = aiori_posix_rmdir,
189  .rename = POSIX_Rename,
190  .access = aiori_posix_access,
191  .stat = aiori_posix_stat,
192  .get_options = POSIX_options,
193  .enable_mdtest = true,
194  .sync = POSIX_Sync,
195  .check_params = POSIX_check_params
196 };
197 
198 /***************************** F U N C T I O N S ******************************/
199 
201 
203  hints = params;
204 }
205 
207  posix_options_t * o = (posix_options_t*) param;
208  if (o->beegfs_chunkSize != -1 && (!ISPOWEROFTWO(o->beegfs_chunkSize) || o->beegfs_chunkSize < (1<<16)))
209  ERR("beegfsChunkSize must be a power of two and >64k");
210  if(o->lustre_stripe_count != -1 || o->lustre_stripe_size != 0)
211  o->lustre_set_striping = 1;
212  if(o->gpuDirect && ! o->direct_io){
213  ERR("GPUDirect required direct I/O to be used!");
214  }
215 #ifndef HAVE_GPU_DIRECT
216  if(o->gpuDirect){
217  ERR("GPUDirect support is not compiled");
218  }
219 #endif
220  return 0;
221 }
222 
223 #ifdef HAVE_GPFS_FCNTL_H
224 void gpfs_free_all_locks(int fd)
225 {
226  int rc;
227  struct {
228  gpfsFcntlHeader_t header;
229  gpfsFreeRange_t release;
230  } release_all;
231  release_all.header.totalLength = sizeof(release_all);
232  release_all.header.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
233  release_all.header.fcntlReserved = 0;
234 
235  release_all.release.structLen = sizeof(release_all.release);
236  release_all.release.structType = GPFS_FREE_RANGE;
237  release_all.release.start = 0;
238  release_all.release.length = 0;
239 
240  rc = gpfs_fcntl(fd, &release_all);
241  if (verbose >= VERBOSE_0 && rc != 0) {
242  WARNF("gpfs_fcntl(%d, ...) release all locks hint failed.", fd);
243  }
244 }
245 void gpfs_access_start(int fd, IOR_offset_t length, IOR_offset_t offset, int access)
246 {
247  int rc;
248  struct {
249  gpfsFcntlHeader_t header;
250  gpfsAccessRange_t access;
251  } take_locks;
252 
253  take_locks.header.totalLength = sizeof(take_locks);
254  take_locks.header.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
255  take_locks.header.fcntlReserved = 0;
256 
257  take_locks.access.structLen = sizeof(take_locks.access);
258  take_locks.access.structType = GPFS_ACCESS_RANGE;
259  take_locks.access.start = offset;
260  take_locks.access.length = length;
261  take_locks.access.isWrite = (access == WRITE);
262 
263  rc = gpfs_fcntl(fd, &take_locks);
264  if (verbose >= VERBOSE_2 && rc != 0) {
265  WARNF("gpfs_fcntl(%d, ...) access range hint failed.", fd);
266  }
267 }
268 
269 void gpfs_access_end(int fd, IOR_offset_t length, IOR_offset_t offset, int access)
270 {
271  int rc;
272  struct {
273  gpfsFcntlHeader_t header;
274  gpfsFreeRange_t free;
275  } free_locks;
276 
277 
278  free_locks.header.totalLength = sizeof(free_locks);
279  free_locks.header.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
280  free_locks.header.fcntlReserved = 0;
281 
282  free_locks.free.structLen = sizeof(free_locks.free);
283  free_locks.free.structType = GPFS_FREE_RANGE;
284  free_locks.free.start = offset;
285  free_locks.free.length = length;
286 
287  rc = gpfs_fcntl(fd, &free_locks);
288  if (verbose >= VERBOSE_2 && rc != 0) {
289  WARNF("gpfs_fcntl(%d, ...) free range hint failed.", fd);
290  }
291 }
292 
293 #ifdef HAVE_GPFSFINEGRAINWRITESHARING_T
294 /* This hint optimizes the performance of small strided
295  writes to a shared file from a parallel application */
296 void gpfs_fineGrainWriteSharing(int fd)
297 {
298  struct
299  {
300  gpfsFcntlHeader_t header;
301  gpfsFineGrainWriteSharing_t write;
302  } sharingHint;
303  int rc;
304 
305  sharingHint.header.totalLength = sizeof(sharingHint);
306  sharingHint.header.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
307  sharingHint.header.fcntlReserved = 0;
308 
309  sharingHint.write.structLen = sizeof(sharingHint.write);
310  sharingHint.write.structType = GPFS_FINE_GRAIN_WRITE_SHARING;
311  sharingHint.write.fineGrainWriteSharing = 1;
312  sharingHint.write.taskId = -1;
313  sharingHint.write.totalTasks = -1;
314  sharingHint.write.recordSize = -1;
315 
316  rc = gpfs_fcntl(fd, &sharingHint);
317  if (verbose >= VERBOSE_2 && rc != 0) {
318  WARNF("gpfs_fcntl(%d, ...) fine grain write sharing hint failed.", fd);
319  }
320 }
321 
322 /* This hint optimizes the performance of small strided
323  reads from a shared file from a parallel application */
324 void gpfs_fineGrainReadSharing(int fd)
325 {
326  struct
327  {
328  gpfsFcntlHeader_t header;
329 #ifdef HAVE_GPFSFINEGRAINREADSHARING_T
330  gpfsFineGrainReadSharing_t read;
331 #else
332  gpfsPrefetch_t read;
333 #endif
334  } sharingHint;
335  int rc;
336 
337  sharingHint.header.totalLength = sizeof(sharingHint);
338  sharingHint.header.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
339  sharingHint.header.fcntlReserved = 0;
340 
341  sharingHint.read.structLen = sizeof(sharingHint.read);
342 #ifdef HAVE_GPFSFINEGRAINREADSHARING_T
343  sharingHint.read.structType = GPFS_FINE_GRAIN_READ_SHARING;
344  sharingHint.read.fineGrainReadSharing = 1;
345 #else
346  sharingHint.read.structType = GPFS_PREFETCH;
347  sharingHint.read.prefetchEnableRead = 0;
348  sharingHint.read.prefetchEnableWrite = 1;
349 #endif
350 
351  rc = gpfs_fcntl(fd, &sharingHint);
352  if (verbose >= VERBOSE_2 && rc != 0) {
353  WARNF("gpfs_fcntl(%d, ...) fine grain read sharing hint failed.", fd);
354  }
355 }
356 #endif
357 #endif
358 
359 #ifdef HAVE_BEEGFS_BEEGFS_H
360 
/*
 * Create an anonymous temporary file inside dirPath.
 *
 * The file is created with mkstemp() and unlinked right away, so only
 * the returned descriptor keeps it alive.
 *
 * Returns the open descriptor, or -1 when the name buffer cannot be
 * allocated or mkstemp() fails. The caller must close() the descriptor.
 */
int mkTempInDir(char* dirPath)
{
        static const char suffix[] = "/XXXXXX";
        /* sizeof(suffix) already accounts for the terminating NUL */
        size_t len = strlen(dirPath) + sizeof(suffix);
        char* tmpfilename = malloc(len);
        if (tmpfilename == NULL) {
                return -1;
        }
        snprintf(tmpfilename, len, "%s%s", dirPath, suffix);

        int fd = mkstemp(tmpfilename);
        if (fd >= 0) {
                /* only a successfully created file has a name to remove */
                unlink(tmpfilename);
        }
        free(tmpfilename);

        return fd;
}
373 
/*
 * Query the striping that BeeGFS applies to new files in dirPath by
 * creating a temporary file there and reading its stripe info.
 *
 * numTargetsOut / chunkSizeOut: filled in by beegfs_getStripeInfo()
 *
 * Returns false when the temporary file cannot be created, otherwise the
 * result of beegfs_getStripeInfo().
 */
bool beegfs_getStriping(char* dirPath, u_int16_t* numTargetsOut, unsigned* chunkSizeOut)
{
        int fd = mkTempInDir(dirPath);
        /* mkTempInDir() reports failure as -1; 0 is a valid descriptor */
        if (fd < 0) {
                return false;
        }

        unsigned stripePattern = 0;
        bool retVal = beegfs_getStripeInfo(fd, &stripePattern, chunkSizeOut, numTargetsOut);
        close(fd);

        return retVal;
}
387 
/*
 * Tell whether a BeeGFS tunable was given a value: -1 is the marker for
 * "not set", every other value counts as set.
 */
bool beegfs_isOptionSet(int opt) {
        const int unset_marker = -1;
        if (opt == unset_marker) {
                return false;
        }
        return true;
}
391 
/*
 * Check whether filepath already exists with exactly the requested
 * BeeGFS striping.
 *
 * Returns true only if the file can be opened, its stripe info can be
 * read, and both the number of targets and the chunk size match.
 */
bool beegfs_compatibleFileExists(char* filepath, int numTargets, int chunkSize)
{
        int fd = open(filepath, O_RDWR);

        if (fd == -1)
                return false;

        unsigned read_stripePattern = 0;
        u_int16_t read_numTargets = 0;
        /* unsigned, matching the unsigned* that beegfs_getStriping() hands
         * to the same beegfs_getStripeInfo() parameter */
        unsigned read_chunkSize = 0;

        bool retVal = beegfs_getStripeInfo(fd, &read_stripePattern, &read_chunkSize, &read_numTargets);

        close(fd);

        /* a negative request can never match an unsigned chunk size */
        return retVal
                && read_numTargets == numTargets
                && chunkSize >= 0
                && read_chunkSize == (unsigned) chunkSize;
}
409 
/*
 * Create a file on a BeeGFS file system with striping parameters.
 *
 * filepath:   file to create; it is never modified, dirname()/basename()
 *             work on private copies
 * mode:       permission bits for the new file
 * numTargets: stripe target count, or -1 to use the directory default
 * chunkSize:  stripe chunk size, or -1 to use the directory default
 *
 * Returns true once the parent directory was confirmed to be BeeGFS and
 * the create step was attempted; false when the directory cannot be
 * opened or is not on BeeGFS.
 */
bool beegfs_createFilePath(char* filepath, mode_t mode, int numTargets, int chunkSize)
{
        bool retVal = false;
        DIR* parentDirS = NULL;
        char* filenameTmp = NULL;
        char* dirTmp = strdup(filepath);
        if (dirTmp == NULL) {
                ERR("Failed to duplicate file path");
                goto out;
        }

        /* dirname() may modify its argument, hence the copy */
        char* dir = dirname(dirTmp);
        parentDirS = opendir(dir);
        if (!parentDirS) {
                ERRF("Failed to get directory: %s", dir);
                goto out;
        }

        int parentDirFd = dirfd(parentDirS);
        if (parentDirFd < 0) {
                ERRF("Failed to get directory descriptor: %s", dir);
                goto out;
        }

        if (!beegfs_testIsBeeGFS(parentDirFd)) {
                WARN("Not a BeeGFS file system");
                goto out;
        }

        /* fill in whichever striping value the user left unset */
        if (   !beegfs_isOptionSet(numTargets)
            || !beegfs_isOptionSet(chunkSize)) {
                u_int16_t defaultNumTargets = 0;
                unsigned  defaultChunkSize  = 0;
                bool haveDefaults = beegfs_getStriping(dir,
                                                       &defaultNumTargets,
                                                       &defaultChunkSize);
                if (!haveDefaults) {
                        ERR("Failed to get default BeeGFS striping values");
                }

                numTargets = beegfs_isOptionSet(numTargets) ?
                        numTargets : defaultNumTargets;
                chunkSize  = beegfs_isOptionSet(chunkSize) ?
                        chunkSize  : defaultChunkSize;
        }

        filenameTmp = strdup(filepath);
        if (filenameTmp == NULL) {
                ERR("Failed to duplicate file path");
                goto out;
        }
        /* basename() may modify its argument too: use the copy made for
         * it, not the caller's string */
        char* filename = basename(filenameTmp);
        bool isFileCreated = beegfs_compatibleFileExists(filepath, numTargets, chunkSize)
                || beegfs_createFile(parentDirFd, filename,
                                     mode, numTargets, chunkSize);
        if (!isFileCreated) {
                ERR("Could not create file");
        }
        retVal = true;

out:
        free(filenameTmp);
        if (parentDirS) {
                closedir(parentDirS);
        }
        free(dirTmp);
        return retVal;
}
470 #endif /* HAVE_BEEGFS_BEEGFS_H */
471 
472 
473 #ifdef HAVE_LUSTRE_USER
474 void lustre_disable_file_locks(const int fd) {
475  int lustre_ioctl_flags = LL_FILE_IGNORE_LOCK;
476  if (verbose >= VERBOSE_1) {
477  INFO("** Disabling lustre range locking **\n");
478  }
479  if (ioctl(fd, LL_IOC_SETFLAGS, &lustre_ioctl_flags) == -1) {
480  ERRF("ioctl(%d, LL_IOC_SETFLAGS, ...) failed", fd);
481  }
482 }
483 #endif /* HAVE_LUSTRE_USER */
484 
485 /*
486  * Create and open a file through the POSIX interface.
487  */
488 aiori_fd_t *POSIX_Create(char *testFileName, int flags, aiori_mod_opt_t * param)
489 {
490  int fd_oflag = O_BINARY;
491  int mode = 0664;
492  posix_fd * pfd = safeMalloc(sizeof(posix_fd));
493  posix_options_t * o = (posix_options_t*) param;
494  if (o->direct_io == TRUE){
495  set_o_direct_flag(& fd_oflag);
496  }
497 
498  if(hints->dryRun)
499  return (aiori_fd_t*) 0;
500 
501 #ifdef HAVE_LUSTRE_USER
502 /* Add a #define for FASYNC if not available, as it forms part of
503  * the Lustre O_LOV_DELAY_CREATE definition. */
504 #ifndef FASYNC
505 #define FASYNC 00020000 /* fcntl, for BSD compatibility */
506 #endif
507  if (o->lustre_set_striping) {
508  /* In the single-shared-file case, task 0 has to create the
509  file with the Lustre striping options before any other
510  processes open the file */
511  if (!hints->filePerProc && rank != 0) {
512  MPI_CHECK(MPI_Barrier(testComm), "barrier error");
513  fd_oflag |= O_RDWR;
514  pfd->fd = open64(testFileName, fd_oflag, mode);
515  if (pfd->fd < 0){
516  ERRF("open64(\"%s\", %d, %#o) failed. Error: %s",
517  testFileName, fd_oflag, mode, strerror(errno));
518  }
519  } else {
520  struct lov_user_md opts = { 0 };
521 
522  /* Setup Lustre IOCTL striping pattern structure */
523  opts.lmm_magic = LOV_USER_MAGIC;
524  opts.lmm_stripe_size = o->lustre_stripe_size;
525  opts.lmm_stripe_offset = o->lustre_start_ost;
526  opts.lmm_stripe_count = o->lustre_stripe_count;
527 
528  /* File needs to be opened O_EXCL because we cannot set
529  * Lustre striping information on a pre-existing file.*/
530 
531  fd_oflag |= O_CREAT | O_EXCL | O_RDWR | O_LOV_DELAY_CREATE;
532  pfd->fd = open64(testFileName, fd_oflag, mode);
533  if (pfd->fd < 0) {
534  ERRF("Unable to open '%s': %s\n",
535  testFileName, strerror(errno));
536  } else if (ioctl(pfd->fd, LL_IOC_LOV_SETSTRIPE, &opts)) {
537  char *errmsg = "stripe already set";
538  if (errno != EEXIST && errno != EALREADY)
539  errmsg = strerror(errno);
540  ERRF("Error on ioctl for '%s' (%d): %s\n",
541  testFileName, pfd->fd, errmsg);
542  }
543  if (!hints->filePerProc)
544  MPI_CHECK(MPI_Barrier(testComm),
545  "barrier error");
546  }
547  } else {
548 #endif /* HAVE_LUSTRE_USER */
549 
550  fd_oflag |= O_CREAT | O_RDWR;
551 
552 #ifdef HAVE_BEEGFS_BEEGFS_H
553  if (beegfs_isOptionSet(o->beegfs_chunkSize)
554  || beegfs_isOptionSet(o->beegfs_numTargets)) {
555  bool result = beegfs_createFilePath(testFileName,
556  mode,
558  o->beegfs_chunkSize);
559  if (result) {
560  fd_oflag &= ~O_CREAT;
561  } else {
562  WARN("BeeGFS tuning failed");
563  }
564  }
565 #endif /* HAVE_BEEGFS_BEEGFS_H */
566 
567  pfd->fd = open64(testFileName, fd_oflag, mode);
568  if (pfd->fd < 0){
569  ERRF("open64(\"%s\", %d, %#o) failed. Error: %s",
570  testFileName, fd_oflag, mode, strerror(errno));
571  }
572 
573 #ifdef HAVE_LUSTRE_USER
574  }
575 
576  if (o->lustre_ignore_locks) {
577  lustre_disable_file_locks(pfd->fd);
578  }
579 #endif /* HAVE_LUSTRE_USER */
580 
581 #ifdef HAVE_GPFS_FCNTL_H
582  /* in the single shared file case, immediately release all locks, with
583  * the intent that we can avoid some byte range lock revocation:
584  * everyone will be writing/reading from individual regions */
585  if (o->gpfs_release_token ) {
586  gpfs_free_all_locks(pfd->fd);
587  }
588 #ifdef HAVE_GPFSFINEGRAINWRITESHARING_T
589  /* Enable fine grain write sharing */
591  gpfs_fineGrainWriteSharing(pfd->fd);
592  }
593 #endif
594 #endif
595 #ifdef HAVE_GPU_DIRECT
596  if(o->gpuDirect){
597  init_cufile(pfd);
598  }
599 #endif
600  return (aiori_fd_t*) pfd;
601 }
602 
603 /*
604  * Create a file through mknod interface.
605  */
606 int POSIX_Mknod(char *testFileName)
607 {
608  int ret;
609 
610  ret = mknod(testFileName, S_IFREG | S_IRUSR, 0);
611  if (ret < 0)
612  ERR("mknod failed");
613 
614  return ret;
615 }
616 
617 /*
618  * Open a file through the POSIX interface.
619  */
620 aiori_fd_t *POSIX_Open(char *testFileName, int flags, aiori_mod_opt_t * param)
621 {
622  int fd_oflag = O_BINARY;
623  if(flags & IOR_RDONLY){
624  fd_oflag |= O_RDONLY;
625  }else if(flags & IOR_WRONLY){
626  fd_oflag |= O_WRONLY;
627  }else{
628  fd_oflag |= O_RDWR;
629  }
630  posix_fd * pfd = safeMalloc(sizeof(posix_fd));
631  posix_options_t * o = (posix_options_t*) param;
632  if (o->direct_io == TRUE){
633  set_o_direct_flag(&fd_oflag);
634  }
635 
636  if(hints->dryRun)
637  return (aiori_fd_t*) 0;
638 
639  pfd->fd = open64(testFileName, fd_oflag);
640  if (pfd->fd < 0)
641  ERRF("open64(\"%s\", %d) failed: %s", testFileName, fd_oflag, strerror(errno));
642 
643 #ifdef HAVE_LUSTRE_USER
644  if (o->lustre_ignore_locks) {
645  lustre_disable_file_locks(pfd->fd);
646  }
647 #endif /* HAVE_LUSTRE_USER */
648 
649 #ifdef HAVE_GPFS_FCNTL_H
650  if(o->gpfs_release_token) {
651  gpfs_free_all_locks(pfd->fd);
652  }
653 #ifdef HAVE_GPFSFINEGRAINWRITESHARING_T
654  /* Enable fine grain read sharing */
656  gpfs_fineGrainReadSharing(pfd->fd);
657  }
658 #endif
659 #endif
660 #ifdef HAVE_GPU_DIRECT
661  if(o->gpuDirect){
662  init_cufile(pfd);
663  }
664 #endif
665  return (aiori_fd_t*) pfd;
666 }
667 
668 /*
669  * Write or read access to file using the POSIX interface.
670  */
671 static IOR_offset_t POSIX_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer,
672  IOR_offset_t length, IOR_offset_t offset, aiori_mod_opt_t * param)
673 {
674  int xferRetries = 0;
675  long long remaining = (long long)length;
676  char *ptr = (char *)buffer;
677  long long rc;
678  int fd;
679  posix_options_t * o = (posix_options_t*) param;
680 
681  if(hints->dryRun)
682  return length;
683 
684  posix_fd * pfd = (posix_fd *) file;
685  fd = pfd->fd;
686 
687 #ifdef HAVE_GPFS_FCNTL_H
688  if (o->gpfs_hint_access) {
689  gpfs_access_start(fd, length, offset, access);
690  }
691 #endif
692 
693 
694  /* seek to offset */
695  if (lseek64(fd, offset, SEEK_SET) == -1)
696  ERRF("lseek64(%d, %lld, SEEK_SET) failed", fd, offset);
697  off_t mem_offset = 0;
698 
699  if(o->range_locks){
700  struct flock lck = {
701  .l_whence = SEEK_SET,
702  .l_start = offset,
703  .l_len = remaining,
704  .l_type = access == WRITE ? F_WRLCK : F_RDLCK,
705  };
706  if(fcntl(fd, F_SETLKW, &lck) != 0){
707  WARN("Error with F_SETLKW");
708  }
709  }
710  while (remaining > 0) {
711  /* write/read file */
712  if (access == WRITE) { /* WRITE */
713  if (verbose >= VERBOSE_4) {
714  INFOF("task %d writing to offset %lld\n",
715  rank,
716  offset + length - remaining);
717  }
718 #ifdef HAVE_GPU_DIRECT
719  if(o->gpuDirect){
720  rc = cuFileWrite(pfd->cf_handle, ptr, remaining, offset + mem_offset, mem_offset);
721  }else{
722 #endif
723  rc = write(fd, ptr, remaining);
724 #ifdef HAVE_GPU_DIRECT
725  }
726 #endif
727  if (rc < 0){
728  WARNF("write(%d, %p, %lld) failed %s", fd, (void*)ptr, remaining, strerror(errno));
729  }
730  if (hints->fsyncPerWrite == TRUE){
731  POSIX_Fsync((aiori_fd_t*) &fd, param);
732  }
733  } else { /* READ or CHECK */
734  if (verbose >= VERBOSE_4) {
735  INFOF("task %d reading from offset %lld\n",
736  rank,
737  offset + length - remaining);
738  }
739 #ifdef HAVE_GPU_DIRECT
740  if(o->gpuDirect){
741  rc = cuFileRead(pfd->cf_handle, ptr, remaining, offset + mem_offset, mem_offset);
742  }else{
743 #endif
744  rc = read(fd, ptr, remaining);
745 #ifdef HAVE_GPU_DIRECT
746  }
747 #endif
748  if (rc == 0){
749  WARNF("read(%d, %p, %lld) returned EOF prematurely", fd, (void*)ptr, remaining);
750  return length - remaining;
751  }
752 
753  if (rc < 0){
754  WARNF("read(%d, %p, %lld) failed %s", fd, (void*)ptr, remaining, strerror(errno));
755  return length - remaining;
756  }
757  }
758  if (rc < remaining) {
759  WARNF("task %d, partial %s, %lld of %lld bytes at offset %lld\n",
760  rank,
761  access == WRITE ? "write()" : "read()",
762  rc, remaining,
763  offset + length - remaining);
764  if (xferRetries > MAX_RETRY || hints->singleXferAttempt){
765  WARN("too many retries -- aborting");
766  return length - remaining;
767  }
768  }
769  assert(rc >= 0);
770  assert(rc <= remaining);
771  remaining -= rc;
772  ptr += rc;
773  mem_offset += rc;
774  xferRetries++;
775  }
776  if(o->range_locks){
777  struct flock lck = {
778  .l_whence = SEEK_SET,
779  .l_start = offset,
780  .l_len = length,
781  .l_type = F_UNLCK,
782  };
783  if(fcntl(fd, F_SETLK, &lck) != 0){
784  WARN("Error with F_UNLCK");
785  }
786  }
787 #ifdef HAVE_GPFS_FCNTL_H
788  if (o->gpfs_hint_access) {
789  gpfs_access_end(fd, length, offset, access);
790  }
791 #endif
792  return (length);
793 }
794 
796 {
797  int fd = ((posix_fd*) afd)->fd;
798  if (fsync(fd) != 0)
799  WARNF("fsync(%d) failed", fd);
800 }
801 
802 
804 {
805  int ret = system("sync");
806  if (ret != 0){
807  FAIL("Error executing the sync command, ensure it exists.");
808  }
809 }
810 
811 
812 /*
813  * Close a file through the POSIX interface.
814  */
816 {
817  if(hints->dryRun)
818  return;
819  posix_options_t * o = (posix_options_t*) param;
820  int fd = ((posix_fd*) afd)->fd;
821 #ifdef HAVE_GPU_DIRECT
822  if(o->gpuDirect){
823  cuFileHandleDeregister(((posix_fd*) afd)->cf_handle);
824  }
825 #endif
826  if (close(fd) != 0){
827  ERRF("close(%d) failed", fd);
828  }
829  free(afd);
830 }
831 
832 /*
833  * Delete a file through the POSIX interface.
834  */
835 void POSIX_Delete(char *testFileName, aiori_mod_opt_t * param)
836 {
837  if(hints->dryRun)
838  return;
839  if (unlink(testFileName) != 0){
840  WARNF("[RANK %03d]: unlink() of file \"%s\" failed", rank, testFileName);
841  }
842 }
843 
844 int POSIX_Rename(const char * oldfile, const char * newfile, aiori_mod_opt_t * module_options){
845  if(hints->dryRun)
846  return 0;
847 
848  if(rename(oldfile, newfile) != 0){
849  WARNF("[RANK %03d]: rename() of file \"%s\" to \"%s\" failed", rank, oldfile, newfile);
850  return -1;
851  }
852  return 0;
853 }
854 
855 /*
856  * Use POSIX stat() to return aggregate file size.
857  */
858 IOR_offset_t POSIX_GetFileSize(aiori_mod_opt_t * test, char *testFileName)
859 {
860  if(hints->dryRun)
861  return 0;
862  struct stat stat_buf;
863  IOR_offset_t aggFileSizeFromStat, tmpMin, tmpMax, tmpSum;
864 
865  if (stat(testFileName, &stat_buf) != 0) {
866  ERRF("stat(\"%s\", ...) failed", testFileName);
867  }
868  aggFileSizeFromStat = stat_buf.st_size;
869 
870  return (aggFileSizeFromStat);
871 }
872 
874 #ifdef HAVE_GPU_DIRECT
875  CUfileError_t err = cuFileDriverOpen();
876 #endif
877 }
878 
880 #ifdef HAVE_GPU_DIRECT
881  CUfileError_t err = cuFileDriverClose();
882 #endif
883 }
int gpfs_release_token
Definition: aiori-POSIX.h:20
#define ERRF(FORMAT,...)
Definition: aiori-debug.h:67
#define ISPOWEROFTWO(x)
Definition: ior.h:42
static void POSIX_Finalize(aiori_mod_opt_t *options)
Definition: aiori-POSIX.c:879
#define LAST_OPTION
Definition: option.h:39
#define VERBOSE_0
Definition: iordef.h:106
CURLcode rc
Definition: aiori-S3-4c.c:111
int errno
struct benchmark_options o
Definition: md-workbench.c:133
int POSIX_Mknod(char *testFileName)
Definition: aiori-POSIX.c:606
static IOR_offset_t POSIX_Xfer(int, aiori_fd_t *, IOR_size_t *, IOR_offset_t, IOR_offset_t, aiori_mod_opt_t *)
Definition: aiori-POSIX.c:671
static void POSIX_Initialize(aiori_mod_opt_t *options)
Definition: aiori-POSIX.c:873
void POSIX_Close(aiori_fd_t *afd, aiori_mod_opt_t *param)
Definition: aiori-POSIX.c:815
ior_aiori_t posix_aiori
Definition: aiori-POSIX.c:171
#define FAIL(...)
Definition: aiori-debug.h:16
int POSIX_Rename(const char *oldfile, const char *newfile, aiori_mod_opt_t *module_options)
Definition: aiori-POSIX.c:844
int lustre_set_striping
Definition: aiori-POSIX.h:12
#define IOR_RDONLY
Definition: aiori.h:28
#define MPI_CHECK(MPI_STATUS, MSG)
Definition: aiori-debug.h:97
#define WRITE
Definition: iordef.h:100
int aiori_posix_stat(const char *path, struct stat *buf, aiori_mod_opt_t *module_options)
Definition: aiori.c:230
int gpfs_createsharing
Definition: aiori-POSIX.h:24
aiori_fd_t * POSIX_Open(char *testFileName, int flags, aiori_mod_opt_t *param)
Definition: aiori-POSIX.c:620
char * aiori_get_version()
Definition: aiori.c:235
#define O_BINARY
Definition: aiori-POSIX.c:87
aiori_fd_t * POSIX_Create(char *testFileName, int flags, aiori_mod_opt_t *param)
Definition: aiori-POSIX.c:488
MPI_Comm testComm
Definition: utilities.c:73
static option_help options[]
Definition: aiori-CEPHFS.c:59
void POSIX_Delete(char *testFileName, aiori_mod_opt_t *param)
Definition: aiori-POSIX.c:835
static aiori_xfer_hint_t * hints
Definition: aiori-POSIX.c:200
#define WARN(MSG)
Definition: aiori-debug.h:45
void POSIX_Sync(aiori_mod_opt_t *param)
Definition: aiori-POSIX.c:803
int lustre_stripe_count
Definition: aiori-POSIX.h:13
int singleXferAttempt
Definition: aiori.h:75
#define open64
Definition: aiori-POSIX.c:79
#define MAX_RETRY
Definition: iordef.h:115
int POSIX_check_params(aiori_mod_opt_t *param)
Definition: aiori-POSIX.c:206
#define INFO(MSG)
Definition: aiori-debug.h:62
int aiori_posix_access(const char *path, int mode, aiori_mod_opt_t *module_options)
Definition: aiori.c:225
#define IOR_WRONLY
Definition: aiori.h:29
#define WARNF(FORMAT,...)
Definition: aiori-debug.h:30
long long int IOR_size_t
Definition: iordef.h:124
int aiori_posix_rmdir(const char *path, aiori_mod_opt_t *module_options)
Definition: aiori.c:220
void POSIX_Fsync(aiori_fd_t *afd, aiori_mod_opt_t *param)
Definition: aiori-POSIX.c:795
#define VERBOSE_2
Definition: iordef.h:108
void POSIX_xfer_hints(aiori_xfer_hint_t *params)
Definition: aiori-POSIX.c:202
int aiori_posix_mkdir(const char *path, mode_t mode, aiori_mod_opt_t *module_options)
Definition: aiori.c:215
#define lseek64
Definition: aiori-POSIX.c:83
#define INFOF(FORMAT,...)
Definition: aiori-debug.h:50
int verbose
Definition: utilities.c:72
#define VERBOSE_4
Definition: iordef.h:110
int aiori_posix_statfs(const char *path, ior_aiori_statfs_t *stat_buf, aiori_mod_opt_t *module_options)
Definition: aiori.c:169
void set_o_direct_flag(int *flag)
Definition: utilities.c:332
#define ERR(MSG)
Definition: aiori-debug.h:75
#define VERBOSE_1
Definition: iordef.h:107
option_help * POSIX_options(aiori_mod_opt_t **init_backend_options, aiori_mod_opt_t *init_values)
Definition: aiori-POSIX.c:117
int fsyncPerWrite
Definition: aiori.h:70
char * name
Definition: aiori.h:88
int filePerProc
Definition: aiori.h:65
int lustre_stripe_size
Definition: aiori-POSIX.h:14
long long int IOR_offset_t
Definition: iordef.h:123
int gpfs_finegrain_readsharing
Definition: aiori-POSIX.h:23
int rank
Definition: utilities.c:70
int lustre_ignore_locks
Definition: aiori-POSIX.h:16
#define TRUE
Definition: iordef.h:80
IOR_offset_t POSIX_GetFileSize(aiori_mod_opt_t *test, char *testFileName)
Definition: aiori-POSIX.c:858
int gpfs_finegrain_writesharing
Definition: aiori-POSIX.h:22
void * safeMalloc(uint64_t size)
Definition: utilities.c:238
#define NULL
Definition: iordef.h:84