/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * vim: set ts=8 sts=4 et sw=4 tw=99:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef vm_ThreadPool_h
#define vm_ThreadPool_h

#include "mozilla/Atomics.h"

#include "jsalloc.h"
#include "jslock.h"
#include "jsmath.h"
#include "jspubtd.h"

#include "js/Vector.h"
#include "vm/Monitor.h"

struct JSRuntime;
struct JSCompartment;

namespace js {

class ThreadPool;

/////////////////////////////////////////////////////////////////////////////
// ThreadPoolWorker
//
// Class for worker threads in the pool. All threads (i.e. helpers and main
// thread) have a worker associated with them. By convention, the worker id of
// the main thread is 0.

class ThreadPoolWorker
{
    const uint32_t workerId_;
    ThreadPool *pool_;

    // Slices this thread is responsible for.
    //
    // This is a uint32 composed of two uint16s (the lower and upper bounds) so
    // that we may do a single CAS. See {Compose,Decompose}SliceBounds
    // functions below (an illustrative sketch of the packing follows this
    // class).
    mozilla::Atomic<uint32_t> sliceBounds_;

    // Current point in the worker's lifecycle.
    volatile enum WorkerState {
        CREATED, ACTIVE, TERMINATED
    } state_;

    // Per-worker scheduler RNG state used for picking a random worker during
    // work stealing.
    uint32_t schedulerRNGState_;

    // The thread's main function.
    static void HelperThreadMain(void *arg);
    void helperLoop();

    bool hasWork() const;
    bool popSliceFront(uint16_t *sliceId);
    bool popSliceBack(uint16_t *sliceId);
    bool stealFrom(ThreadPoolWorker *victim, uint16_t *sliceId);

    // Get a worker at random from the pool using our own thread-local RNG
    // state. This is a weak, but very fast, random function [1]. We choose
    // [a,b,c] = 11,21,13. (A sketch of the update step follows this class.)
    //
    // [1] http://www.jstatsoft.org/v08/i14/paper
  public:
    static const uint32_t XORSHIFT_A = 11;
    static const uint32_t XORSHIFT_B = 21;
    static const uint32_t XORSHIFT_C = 13;

  private:
    ThreadPoolWorker *randomWorker();

  public:
    ThreadPoolWorker(uint32_t workerId, uint32_t rngSeed, ThreadPool *pool);

    uint32_t id() const { return workerId_; }
    bool isMainThread() const { return id() == 0; }

    // Submits a new set of slices. Assumes !hasWork().
    void submitSlices(uint16_t sliceStart, uint16_t sliceEnd);

    // Get the next slice; work stealing happens here if work stealing is
    // on. Returns false if there are no more slices to hand out.
    bool getSlice(ForkJoinContext *cx, uint16_t *sliceId);

    // Discard remaining slices. Used for aborting jobs.
    void discardSlices();

    // Invoked from the main thread; signals worker to start.
    bool start();

    // Invoked from the main thread; signals the worker loop to return.
    void terminate(AutoLockMonitor &lock);

    static size_t offsetOfSliceBounds() {
        return offsetof(ThreadPoolWorker, sliceBounds_);
    }

    static size_t offsetOfSchedulerRNGState() {
        return offsetof(ThreadPoolWorker, schedulerRNGState_);
    }
};
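
// The helpers below are illustrative sketches only, not part of the real
// interface (the actual {Compose,Decompose}SliceBounds helpers and the
// scheduler RNG update live with the implementation). They show, under that
// assumption, (1) how two uint16 slice bounds can be packed into the single
// uint32 behind |sliceBounds_| so that both ends can be updated with one CAS,
// and (2) one xorshift update step using the [a,b,c] = 11,21,13 constants
// above. The |Example*| names, the choice of which half holds which bound,
// and the exact order of the shifts are assumptions made for illustration.

static inline uint32_t
ExampleComposeSliceBounds(uint16_t from, uint16_t to)
{
    // Pack the pair into one word: upper half = front of the range,
    // lower half = end of the range (the real convention may differ).
    return (uint32_t(from) << 16) | uint32_t(to);
}

static inline void
ExampleDecomposeSliceBounds(uint32_t bounds, uint16_t *from, uint16_t *to)
{
    // Inverse of the packing above.
    *from = uint16_t(bounds >> 16);
    *to = uint16_t(bounds & 0xffff);
}

static inline uint32_t
ExampleXorShiftStep(uint32_t x)
{
    // One step of the weak-but-fast generator from the paper cited above,
    // written in its standard shift-left/shift-right/shift-left form.
    x ^= x << ThreadPoolWorker::XORSHIFT_A;
    x ^= x >> ThreadPoolWorker::XORSHIFT_B;
    x ^= x << ThreadPoolWorker::XORSHIFT_C;
    return x;
}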

/////////////////////////////////////////////////////////////////////////////
// A ParallelJob is the main runnable abstraction in the ThreadPool.
//
// The unit of work here is in terms of threads, *not* slices. The
// user-provided function has the responsibility of getting slices of work via
// the |ForkJoinGetSlice| intrinsic. (A minimal illustrative subclass follows
// the class declaration below.)

class ParallelJob
{
  public:
    virtual bool executeFromWorker(ThreadPoolWorker *worker, uintptr_t stackLimit) = 0;
    virtual bool executeFromMainThread(ThreadPoolWorker *mainWorker) = 0;
};
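
// Illustrative only: the smallest possible ParallelJob. A real job (e.g.
// |ForkJoinShared| in ForkJoin.cpp) arranges for the parallel work to be
// carved into slices and pulled by each thread; this hypothetical subclass
// just shows the two entry points, one of which the pool invokes on each
// participating thread.
class ExampleNopJob : public ParallelJob
{
  public:
    virtual bool executeFromWorker(ThreadPoolWorker *worker, uintptr_t stackLimit) {
        // Per-thread work would go here; |worker->id()| identifies the thread
        // and |stackLimit| bounds the native stack the job may use.
        return true;
    }
    virtual bool executeFromMainThread(ThreadPoolWorker *mainWorker) {
        // Same idea, but invoked on the main thread (worker id 0).
        return true;
    }
};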

/////////////////////////////////////////////////////////////////////////////
// ThreadPool used for parallel JavaScript execution. Unless you are building
// a new kind of parallel service, it is very likely that you do not wish to
// interact with the threadpool directly. In particular, if you wish to
// execute JavaScript in parallel, you probably want to look at |js::ForkJoin|
// in |ForkJoin.cpp|.
//
// The ThreadPool always maintains a fixed pool of worker threads. You can
// query the number of worker threads via the method |numWorkers()|. Note
// that this number may be zero (generally if threads are disabled, or when
// manually specified for benchmarking purposes).
//
// The way to submit a job is using |executeJob()|---in this case, the job
// will be executed by all worker threads, including the main thread. This
// does not fail if there are no worker threads, it simply runs all the work
// using the main thread only.
//
// Of course, each thread may have any number of previously submitted things
// that they are already working on, and so they will finish those before they
// get to this job. Therefore it is possible to have some worker threads pick
// up (and even finish) their piece of the job before others have even
// started. The main thread is also used by the pool as a worker thread.
//
// The ThreadPool supports work stealing. Every time a worker completes all
// the slices in its local queue, it tries to acquire some work from other
// workers (including the main thread). Execution terminates when there is no
// work left to be done, i.e., when all the workers have an empty queue. The
// stealing algorithm operates in 2 phases: (1) workers process all the slices
// in their local queue, and then (2) workers try to steal from other peers.
// Since workers start to steal only *after* they have completed all the
// slices in their queue, the design is particularly convenient in the context
// of Fork/Join-like parallelism, where workers receive a bunch of slices to
// be done at the very beginning of the job, and have to wait until all the
// threads have joined back. During phase (1) there is no synchronization
// overhead between workers introduced by the stealing algorithm, and
// therefore the execution overhead introduced is almost zero with balanced
// workloads. The way a |ParallelJob| is divided into multiple slices has to
// be specified by the instance implementing the job (e.g., |ForkJoinShared|
// in |ForkJoin.cpp|).

class ThreadPool : public Monitor
{
  private:
    friend class ThreadPoolWorker;

    // Initialized lazily.
    js::Vector<ThreadPoolWorker *, 8, SystemAllocPolicy> workers_;

    // The number of active workers. Should only be accessed under lock.
    uint32_t activeWorkers_;
    PRCondVar *joinBarrier_;

    // The current job.
    ParallelJob *job_;

#ifdef DEBUG
    // Initialized at startup only.
    JSRuntime *const runtime_;

    // Number of stolen slices in the last parallel job.
    mozilla::Atomic<uint16_t> stolenSlices_;
#endif

    // Number of pending slices in the current job.
    mozilla::Atomic<uint32_t> pendingSlices_;

    // Whether the main thread is currently processing slices.
    bool isMainThreadActive_;

    bool lazyStartWorkers(JSContext *cx);
    void terminateWorkers();
    void terminateWorkersAndReportOOM(JSContext *cx);
    void join(AutoLockMonitor &lock);
    void waitForWorkers(AutoLockMonitor &lock);
    ThreadPoolWorker *mainThreadWorker() { return workers_[0]; }

  public:
#ifdef DEBUG
    static size_t offsetOfStolenSlices() {
        return offsetof(ThreadPool, stolenSlices_);
    }
#endif
    static size_t offsetOfPendingSlices() {
        return offsetof(ThreadPool, pendingSlices_);
    }
    static size_t offsetOfWorkers() {
        return offsetof(ThreadPool, workers_);
    }

    static const uint16_t MAX_SLICE_ID = UINT16_MAX;

    ThreadPool(JSRuntime *rt);
    ~ThreadPool();

    bool init();

    // Return number of worker threads in the pool, counting the main thread.
    uint32_t numWorkers() const;

    // Returns whether we have any pending slices.
    bool hasWork() const { return pendingSlices_ != 0; }

    // Returns the current job. Must have one.
    ParallelJob *job() const {
        MOZ_ASSERT(job_);
        return job_;
    }

    // Returns whether or not the scheduler should perform work stealing.
    bool workStealing() const;

    // Returns whether or not the main thread is working.
    bool isMainThreadActive() const { return isMainThreadActive_; }

#ifdef DEBUG
    // Return the number of stolen slices in the last parallel job.
    uint16_t stolenSlices() { return stolenSlices_; }
#endif

    // Wait until all worker threads have finished their current set
    // of slices and then return. You must not submit new jobs after
    // invoking |terminate()|.
    void terminate();

    // Execute the given ParallelJob using the main thread and any available
    // worker. Blocks until the main thread has completed execution. (A usage
    // sketch follows this class.)
    ParallelResult executeJob(JSContext *cx, ParallelJob *job, uint16_t sliceStart,
                              uint16_t numSlices);

    // Abort the current job.
    void abortJob();
};
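
// Illustrative sketch only, not part of the real API surface: driving the
// pool directly. Real callers go through |js::ForkJoin|; |ExampleRunJob| is a
// hypothetical helper, and it assumes an already-initialized pool (normally
// the runtime-wide instance) plus a job such as the sketch above.
inline ParallelResult
ExampleRunJob(JSContext *cx, ThreadPool &pool, ParallelJob &job, uint16_t numSlices)
{
    // Slices [0, numSlices) are distributed over the main thread and any
    // helper threads; if the pool has no helpers, everything runs on the main
    // thread. The call blocks until the main thread has finished its share.
    return pool.executeJob(cx, &job, 0, numSlices);
}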

} // namespace js

#endif /* vm_ThreadPool_h */