# Source code for vsi.tools.gpu_check

import argparse
import ctypes
import glob
import logging
import os
import pathlib

# Module-level logger named after this module; ``main`` configures basic
# logging when the module is run from the command line
logger = logging.getLogger(__name__)


def find_cudart(search_dirs):
  '''
  Find, load, and return the ``libcudart.so`` library.

  Each directory is searched recursively, in the order given, for files
  matching ``libcudart.so*``. The first candidate that loads successfully is
  returned; candidates that fail to load are skipped.

  Parameters
  ----------
  search_dirs : :obj:`list`
    List of directories to search

  Returns
  -------
  :obj:`ctypes.CDLL`
    The loaded cuda runtime library

  Raises
  ------
  :obj:`OSError`
    If no candidate in any of the search directories could be loaded
  '''
  for search_dir in search_dirs:
    for candidate in pathlib.Path(search_dir).rglob('libcudart.so*'):
      try:
        # ctypes only documents path-like library names from python 3.12
        # onward, so hand it a plain string
        return ctypes.cdll.LoadLibrary(os.fspath(candidate))
      except OSError:
        # not a loadable library (wrong arch, broken link, ...), keep looking
        continue

  raise OSError("Failed to find & load libcudart.so")


def load_cudart(file=None):
  '''
  Load ``libcudart.so`` library.  If not provided as an input, search
  standard locations for the library.

  Parameters
  ----------
  file : :obj:`str`, optional
    Location of ``libcudart.so``. When given, it is loaded directly and no
    search is performed.

  Returns
  -------
  :obj:`ctypes.CDLL`
    The loaded cuda runtime library

  Raises
  ------
  :obj:`OSError`
    If ``file`` cannot be loaded, or if no library could be found and loaded
    from the search directories
  '''

  # load directly
  if file:
    # ctypes only documents path-like library names from python 3.12 onward
    return ctypes.cdll.LoadLibrary(os.fspath(file))

  # list of directories to search (via recursive glob), in priority order
  # - /usr/local/cuda*
  # - /usr/cuda*
  # - ${LD_LIBRARY_PATH}
  # - /usr
  search_dirs = list(pathlib.Path('/usr/local').glob('cuda*'))
  search_dirs.extend(pathlib.Path('/usr').glob('cuda*'))

  ld_library_path = os.getenv('LD_LIBRARY_PATH')
  if ld_library_path:
    # skip empty entries (leading/trailing/doubled separators): an empty
    # string turns into ``pathlib.Path('.')``, which would recursively glob
    # the entire current working directory
    search_dirs.extend(pathlib.Path(entry)
                       for entry in ld_library_path.split(os.pathsep)
                       if entry)

  search_dirs.append(pathlib.Path('/usr'))

  # search for cudart
  return find_cudart(search_dirs)


def load_nvidia_uvm(file=None):
  '''
  A function to load the nvidia uvm device

  Some (older) Linux Operating systems do not load ``/dev/nvidia-uvm`` on boot
  to runlevel 3 (headless). This results in the ``nvidia-uvm`` module not being
  loaded.

  Unfortunately, a simple modprobe does not fix the issue, but a CUDA call on
  the host (not in a container) will.

  This function attempts to locate a ``libcudart.so`` library and calls the
  ``cudaGetDeviceCount`` function, which loads the ``/dev/nvidia-uvm`` driver.
  If it cannot locate the cuda runtime, you can give it the location as an
  argument.

  The CUDA Runtime is required on the host.

  Parameters
  ----------
  file : :obj:`str`, optional
    Location of ``libcudart.so``. When omitted, standard locations are
    searched.

  Raises
  ------
  :obj:`OSError`
    If ``file`` is given but is not an existing file, or if the cuda runtime
    cannot be found or loaded
  '''

  # check file exists
  if file:
    file = pathlib.Path(file)
    if not file.is_file():
      raise OSError(f"File does not exist {file}")

  # load libcudart.so, keeping the underlying loader error as the cause
  try:
    cudart = load_cudart(file)
  except OSError as err:
    if file:
      raise OSError(f"Failed to load cuda runtime from {file}") from err
    raise OSError("Failed to find & load cuda runtime. Try passing the "
                  "full path of cuda runtime as an argument.") from err

  # report
  logger.debug(f"found libcudart.so : {cudart._name}")

  # run cudaGetDeviceCount; the call itself is what triggers the driver load,
  # gpu_count is only collected for reporting
  cudart.cudaGetDeviceCount.argtypes = (ctypes.POINTER(ctypes.c_int), )
  gpu_count = ctypes.c_int(-1)
  exit_code = cudart.cudaGetDeviceCount(ctypes.pointer(gpu_count))

  # report
  logger.debug(f"cudaGetDeviceCount : {exit_code=}, gpu_count={gpu_count.value}")

  # cudaSuccess is 0; surface anything else above debug level so a failed
  # CUDA call does not go unnoticed
  if exit_code != 0:
    logger.warning(f"cudaGetDeviceCount returned non-zero {exit_code=}")


def gpu_check(file=None):
  '''
  Try to load nvidia-uvm if not already loaded

  Nothing is done when no ``/dev/nvidia[0-9]`` device is present or when
  ``/dev/nvidia-uvm`` already exists. Otherwise :func:`load_nvidia_uvm` is
  called and the outcome is logged; an :obj:`OSError` from it is logged as a
  warning rather than raised.

  Parameters
  ----------
  file : :obj:`str`, optional
    Location of ``libcudart.so``, passed on to :func:`load_nvidia_uvm`
  '''
  uvm_device = '/dev/nvidia-uvm'

  # no nvidia cards, nothing to check
  nvidia_cards = glob.glob('/dev/nvidia[0-9]')
  if not nvidia_cards:
    logger.debug('Skip gpu_check : /dev/nvidia[0-9] missing')
    return

  # nvidia-uvm is already there, nothing to load
  if os.path.exists(uvm_device):
    logger.debug('Skip gpu_check : /dev/nvidia-uvm already loaded')
    return

  # attempt the load; a failure is reported but does not stop the final check
  try:
    load_nvidia_uvm(file)
  except OSError as err:
    logger.warning(f'load_nvidia_uvm failure : {err}')

  # report whether the device showed up
  if not os.path.exists(uvm_device):
    logger.critical("load_nvidia_uvm ran but /dev/nvidia-uvm is still not loaded")
    return

  logger.debug("/dev/nvidia-uvm has been successfully loaded")


def main():
  '''
  Command line interface to :func:`gpu_check`

  Accepts an optional ``--file`` argument with the path to ``libcudart.so``,
  sets up DEBUG level logging, and runs :func:`gpu_check`.
  '''

  # command line arguments
  cli = argparse.ArgumentParser()
  cli.add_argument(
      '--file',
      type=pathlib.Path,
      default=None,
      required=False,
      help="Path to libcudart.so",
  )
  options = cli.parse_args()

  # basic logging when called from command line
  logging.basicConfig(
      level='DEBUG',
      format="[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s : %(message)s",
  )

  # run gpu_check
  gpu_check(options.file)


# Run the command line interface only when executed as a script
if __name__ == '__main__':
  main()