{-# LANGUAGE ForeignFunctionInterface #-}

-- |
-- Module      : GHC.OpenMP
-- License     : BSD-3-Clause
-- Stability   : experimental
--
-- = Overview
--
-- An OpenMP runtime that uses GHC's Runtime System (RTS) as its thread pool
-- and scheduler infrastructure.  Standard C code compiled with
-- @gcc -fopenmp@ runs on GHC Capabilities instead of libgomp's pthreads,
-- enabling seamless interoperation between Haskell and OpenMP-parallelized C.
--
-- Full documentation with benchmarks:
-- <https://jhhuh.github.io/ghc-openmp/>
--
-- = How it works
--
-- The runtime implements the GCC @GOMP_*@ ABI and the @omp_*@ user API in a
-- single C file (@cbits\/ghc_omp_runtime_rts.c@, ~1300 lines).  On the first
-- @GOMP_parallel@ call, it initializes the GHC RTS (or increments its ref
-- count if already running from Haskell) and creates N-1 OS worker threads
-- pinned to GHC Capabilities via @rts_setInCallCapability()@.
--
-- Workers are /not/ Haskell threads — they are plain OS threads that spin on
-- atomic variables, invisible to GHC's garbage collector.  This means:
--
--   * GC does not pause OpenMP workers (they don't hold Capabilities)
--   * Haskell green threads and OpenMP parallel regions can run simultaneously
--   * @foreign import ccall safe@ releases the Capability, so other Haskell
--     threads run while OpenMP executes
--
-- = Usage from Haskell
--
-- Add @ghc-openmp@ to your @build-depends@.  The C runtime source is compiled
-- directly into your package using your own GHC, so there are no ABI conflicts
-- with your RTS version.
--
-- @
-- -- In your .cabal file:
-- build-depends: ghc-openmp
-- ghc-options:   -threaded
--
-- -- In your Haskell code:
-- foreign import ccall safe "omp_parallel_sinsum"
--   c_sinsum :: CInt -> IO CDouble
-- @
--
-- = Usage from C
--
-- Build @libghcomp.so@ via @make@ or @nix build@, then link with @-lghcomp@:
--
-- @
-- gcc -fopenmp my_program.c -lghcomp -o my_program
-- @
--
-- The shared library embeds the GHC RTS via rpath — C consumers don't need
-- to know about GHC at all.  A pkg-config template is shipped in the
-- package's @data\/@ directory (see 'pkgConfigTemplatePath').
--
-- = Performance
--
-- After lock-free optimization (sense-reversing barriers, generation-counter
-- dispatch), the runtime achieves performance parity with native libgomp:
--
-- +----------------------+----------------+------------+-------+
-- | Metric               | Native libgomp | RTS-backed | Ratio |
-- +======================+================+============+=======+
-- | Fork/join            | 0.97 us        | 0.81 us    | 0.83x |
-- +----------------------+----------------+------------+-------+
-- | Barrier              | 0.51 us        | 0.25 us    | 0.50x |
-- +----------------------+----------------+------------+-------+
-- | Parallel for (1M)    | 3.85 ms        | 3.91 ms    | 1.01x |
-- +----------------------+----------------+------------+-------+
-- | DGEMM 1024           | 748.8 ms       | 663.4 ms   | 0.89x |
-- +----------------------+----------------+------------+-------+
--
-- = Calling convention overhead
--
-- When calling C from Haskell, the FFI calling convention matters:
--
-- +----------------------------------+---------+----------------------------------+
-- | Convention                       | ns/call | Notes                            |
-- +==================================+=========+==================================+
-- | @foreign import prim@ (Cmm)      | ~0      | GHC can optimize away (LICM)     |
-- +----------------------------------+---------+----------------------------------+
-- | @foreign import ccall unsafe@    | ~2      | STG register save/restore        |
-- +----------------------------------+---------+----------------------------------+
-- | @foreign import ccall safe@      | ~68     | + Capability release/reacquire   |
-- +----------------------------------+---------+----------------------------------+
--
-- Use @unsafe@ for fast, non-blocking C functions.  Use @safe@ for functions
-- that may block or call back into Haskell.

module GHC.OpenMP
    ( -- * Thread queries
      -- | Query and control the OpenMP thread team.  These correspond to the
      -- standard @omp_*@ API from the OpenMP specification.
      --
      -- All functions in this section are bound with @ccall unsafe@ because
      -- they are short, non-blocking C calls (~2 ns overhead).  Most are
      -- simple reads of thread-local variables; 'ompSetNumThreads' is a
      -- setter and 'ompGetNumProcs' queries the operating system.
      ompGetNumThreads
    , ompGetThreadNum
    , ompGetMaxThreads
    , ompGetNumProcs
    , ompSetNumThreads
    , ompInParallel
      -- * Timing
      -- | Wall-clock timing functions for benchmarking parallel regions.
      -- Uses @clock_gettime(CLOCK_MONOTONIC)@ internally.
    , ompGetWtime
    , ompGetWtick
      -- * Nesting
      -- | Query nesting state.  Our runtime serializes nested parallel regions
      -- (inner regions execute with 1 thread) but tracks nesting level up to
      -- 8 levels deep for @omp_get_level()@ and @omp_get_active_level()@.
    , ompGetLevel
    , ompGetActiveLevel
      -- * Pkg-config
      -- | Access the pkg-config template for building @libghcomp.so@.
    , pkgConfigTemplatePath
    ) where

import Foreign.C.Types (CInt (..))

import Paths_ghc_openmp (getDataFileName)

-- Raw binding: @omp_get_num_threads()@ returns a C @int@, hence 'CInt'.
foreign import ccall unsafe "omp_get_num_threads"
    c_omp_get_num_threads :: IO CInt

-- | Returns the number of threads in the current parallel team.
-- Outside a parallel region, returns 1.
--
-- Corresponds to @omp_get_num_threads()@ in the OpenMP specification.
-- Reads from a thread-local variable — essentially free.
--
-- The C result is imported as a C @int@ and widened to 'Int' here.
-- Importing it directly at 'Int' would make GHC read a machine-word-sized
-- result where the C ABI only defines the low @int@-sized part.
ompGetNumThreads :: IO Int
ompGetNumThreads = fromIntegral <$> c_omp_get_num_threads

-- Raw binding: @omp_get_thread_num()@ returns a C @int@, hence 'CInt'.
foreign import ccall unsafe "omp_get_thread_num"
    c_omp_get_thread_num :: IO CInt

-- | Returns the thread number of the calling thread within the current
-- parallel team (0 = master, 1..N-1 = workers).
-- Outside a parallel region, returns 0.
--
-- Corresponds to @omp_get_thread_num()@.  Reads from a thread-local variable.
--
-- The C result is imported as a C @int@ and widened to 'Int' here, so the
-- foreign declaration matches the C prototype exactly.
ompGetThreadNum :: IO Int
ompGetThreadNum = fromIntegral <$> c_omp_get_thread_num

-- Raw binding: @omp_get_max_threads()@ returns a C @int@, hence 'CInt'.
foreign import ccall unsafe "omp_get_max_threads"
    c_omp_get_max_threads :: IO CInt

-- | Returns the maximum number of threads that could form a parallel team.
-- Equivalent to the value of @OMP_NUM_THREADS@ or the number of GHC
-- Capabilities (whichever the runtime was initialized with).
--
-- Corresponds to @omp_get_max_threads()@.
--
-- The C result is imported as a C @int@ and widened to 'Int' here, so the
-- foreign declaration matches the C prototype exactly.
ompGetMaxThreads :: IO Int
ompGetMaxThreads = fromIntegral <$> c_omp_get_max_threads

-- Raw binding: @omp_get_num_procs()@ returns a C @int@, hence 'CInt'.
foreign import ccall unsafe "omp_get_num_procs"
    c_omp_get_num_procs :: IO CInt

-- | Returns the number of processors available to the program.
-- Uses @sysconf(_SC_NPROCESSORS_ONLN)@ on Linux.
--
-- Corresponds to @omp_get_num_procs()@.
--
-- The C result is imported as a C @int@ and widened to 'Int' here, so the
-- foreign declaration matches the C prototype exactly.
ompGetNumProcs :: IO Int
ompGetNumProcs = fromIntegral <$> c_omp_get_num_procs

-- Raw binding: @omp_set_num_threads()@ takes a C @int@, hence 'CInt'.
foreign import ccall unsafe "omp_set_num_threads"
    c_omp_set_num_threads :: CInt -> IO ()

-- | Sets the number of threads for subsequent parallel regions.
-- Takes effect at the next @GOMP_parallel@ call.
--
-- Corresponds to @omp_set_num_threads()@.
--
-- Note: cannot exceed @GHC_OMP_MAX_THREADS@ (default 64).
--
-- The argument is narrowed to a C @int@ before the call so that the foreign
-- declaration matches the C prototype.  Values outside the range of a C
-- @int@ wrap around, as with any 'fromIntegral' narrowing.
ompSetNumThreads :: Int -> IO ()
ompSetNumThreads = c_omp_set_num_threads . fromIntegral

-- Raw binding: @omp_in_parallel()@ returns a C @int@, hence 'CInt'.
foreign import ccall unsafe "omp_in_parallel"
    c_omp_in_parallel :: IO CInt

-- | Returns 1 if called from within an active parallel region, 0 otherwise.
--
-- Corresponds to @omp_in_parallel()@.
--
-- The C result is imported as a C @int@ and widened to 'Int' here, so the
-- foreign declaration matches the C prototype exactly.
ompInParallel :: IO Int
ompInParallel = fromIntegral <$> c_omp_in_parallel

-- | Wall-clock time in seconds, measured from an arbitrary reference point
-- on a monotonic clock.  Only differences between two readings are
-- meaningful, which makes it suitable for timing parallel regions:
--
-- @
-- t0 <- ompGetWtime
-- -- ... parallel work ...
-- t1 <- ompGetWtime
-- putStrLn $ "Elapsed: " ++ show (t1 - t0) ++ " s"
-- @
--
-- Binds @omp_get_wtime()@, which uses @clock_gettime(CLOCK_MONOTONIC)@.
foreign import ccall unsafe "omp_get_wtime"
    ompGetWtime :: IO Double

-- | Resolution, in seconds, of the timer behind 'ompGetWtime'.
-- On modern Linux this is typically @1e-9@ (nanosecond resolution).
--
-- Binds @omp_get_wtick()@.
foreign import ccall unsafe "omp_get_wtick"
    ompGetWtick :: IO Double

-- Raw binding: @omp_get_level()@ returns a C @int@, hence 'CInt'.
foreign import ccall unsafe "omp_get_level"
    c_omp_get_level :: IO CInt

-- | Returns the current nesting level of parallel regions.
-- The outermost parallel region is level 1; outside any parallel region,
-- returns 0.
--
-- Our runtime supports serialized nesting up to 8 levels deep.
--
-- Corresponds to @omp_get_level()@.
--
-- The C result is imported as a C @int@ and widened to 'Int' here, so the
-- foreign declaration matches the C prototype exactly.
ompGetLevel :: IO Int
ompGetLevel = fromIntegral <$> c_omp_get_level

-- Raw binding: @omp_get_active_level()@ returns a C @int@, hence 'CInt'.
foreign import ccall unsafe "omp_get_active_level"
    c_omp_get_active_level :: IO CInt

-- | Returns the number of /active/ (non-serialized) nesting levels.
-- Since our runtime serializes inner parallel regions, this is at most 1.
--
-- Corresponds to @omp_get_active_level()@.
--
-- The C result is imported as a C @int@ and widened to 'Int' here, so the
-- foreign declaration matches the C prototype exactly.
ompGetActiveLevel :: IO Int
ompGetActiveLevel = fromIntegral <$> c_omp_get_active_level

-- | Returns the file path to the @ghcomp.pc.in@ pkg-config template
-- shipped with this package.  This template can be used to generate a
-- @ghcomp.pc@ file for @pkg-config@ integration:
--
-- @
-- template <- pkgConfigTemplatePath
-- -- template points to: \<package-data-dir\>/ghcomp.pc.in
-- -- Substitute \@PREFIX\@ and \@VERSION\@ to generate ghcomp.pc
-- @
--
-- C programs can then use:
--
-- @
-- gcc -fopenmp my_code.c $(pkg-config --cflags --libs ghcomp) -o my_code
-- @
--
-- The path is resolved through the Cabal-generated @Paths_ghc_openmp@
-- module; this function only computes the path and does not check that the
-- file exists.
pkgConfigTemplatePath :: IO FilePath
pkgConfigTemplatePath = getDataFileName "ghcomp.pc.in"