os_linux.cpp revision 13134:2befe2aca4b4
1/*
2 * Copyright (c) 1999, 2017, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25// no precompiled headers
26#include "classfile/classLoader.hpp"
27#include "classfile/systemDictionary.hpp"
28#include "classfile/vmSymbols.hpp"
29#include "code/icBuffer.hpp"
30#include "code/vtableStubs.hpp"
31#include "compiler/compileBroker.hpp"
32#include "compiler/disassembler.hpp"
33#include "interpreter/interpreter.hpp"
34#include "jvm_linux.h"
35#include "logging/log.hpp"
36#include "memory/allocation.inline.hpp"
37#include "memory/filemap.hpp"
38#include "oops/oop.inline.hpp"
39#include "os_linux.inline.hpp"
40#include "os_share_linux.hpp"
41#include "prims/jniFastGetField.hpp"
42#include "prims/jvm.h"
43#include "prims/jvm_misc.hpp"
44#include "runtime/arguments.hpp"
45#include "runtime/atomic.hpp"
46#include "runtime/extendedPC.hpp"
47#include "runtime/globals.hpp"
48#include "runtime/interfaceSupport.hpp"
49#include "runtime/init.hpp"
50#include "runtime/java.hpp"
51#include "runtime/javaCalls.hpp"
52#include "runtime/mutexLocker.hpp"
53#include "runtime/objectMonitor.hpp"
54#include "runtime/orderAccess.inline.hpp"
55#include "runtime/osThread.hpp"
56#include "runtime/perfMemory.hpp"
57#include "runtime/sharedRuntime.hpp"
58#include "runtime/statSampler.hpp"
59#include "runtime/stubRoutines.hpp"
60#include "runtime/thread.inline.hpp"
61#include "runtime/threadCritical.hpp"
62#include "runtime/timer.hpp"
63#include "semaphore_posix.hpp"
64#include "services/attachListener.hpp"
65#include "services/memTracker.hpp"
66#include "services/runtimeService.hpp"
67#include "utilities/decoder.hpp"
68#include "utilities/defaultStream.hpp"
69#include "utilities/events.hpp"
70#include "utilities/elfFile.hpp"
71#include "utilities/growableArray.hpp"
72#include "utilities/macros.hpp"
73#include "utilities/vmError.hpp"
74
75// put OS-includes here
76# include <sys/types.h>
77# include <sys/mman.h>
78# include <sys/stat.h>
79# include <sys/select.h>
80# include <pthread.h>
81# include <signal.h>
82# include <errno.h>
83# include <dlfcn.h>
84# include <stdio.h>
85# include <unistd.h>
86# include <sys/resource.h>
87# include <pthread.h>
88# include <sys/stat.h>
89# include <sys/time.h>
90# include <sys/times.h>
91# include <sys/utsname.h>
92# include <sys/socket.h>
93# include <sys/wait.h>
94# include <pwd.h>
95# include <poll.h>
96# include <semaphore.h>
97# include <fcntl.h>
98# include <string.h>
99# include <syscall.h>
100# include <sys/sysinfo.h>
101# include <gnu/libc-version.h>
102# include <sys/ipc.h>
103# include <sys/shm.h>
104# include <link.h>
105# include <stdint.h>
106# include <inttypes.h>
107# include <sys/ioctl.h>
108
109#ifndef _GNU_SOURCE
110  #define _GNU_SOURCE
111  #include <sched.h>
112  #undef _GNU_SOURCE
113#else
114  #include <sched.h>
115#endif
116
// Older headers may not provide RUSAGE_THREAD for getrusage(). Supply it here;
// callers of getrusage() are prepared to handle the resulting failure.
#ifndef RUSAGE_THREAD
  #define RUSAGE_THREAD   (1)               /* only the calling thread */
#endif

// Maximum path length used by this file (2 * K).
#define MAX_PATH    (2 * K)

// Upper bound on a number of seconds.
#define MAX_SECS 100000000

// All 64 bits set; used as the maximum value for timer info.
#define ALL_64_BITS CONST64(0xFFFFFFFFFFFFFFFF)

// Bit 6 (value 64).
#define LARGEPAGES_BIT (1 << 6)
131////////////////////////////////////////////////////////////////////////////////
132// global variables
133julong os::Linux::_physical_memory = 0;
134
135address   os::Linux::_initial_thread_stack_bottom = NULL;
136uintptr_t os::Linux::_initial_thread_stack_size   = 0;
137
138int (*os::Linux::_clock_gettime)(clockid_t, struct timespec *) = NULL;
139int (*os::Linux::_pthread_getcpuclockid)(pthread_t, clockid_t *) = NULL;
140int (*os::Linux::_pthread_setname_np)(pthread_t, const char*) = NULL;
141Mutex* os::Linux::_createThread_lock = NULL;
142pthread_t os::Linux::_main_thread;
143int os::Linux::_page_size = -1;
144bool os::Linux::_supports_fast_thread_cpu_time = false;
145uint32_t os::Linux::_os_version = 0;
146const char * os::Linux::_glibc_version = NULL;
147const char * os::Linux::_libpthread_version = NULL;
148
149static jlong initial_time_count=0;
150
151static int clock_tics_per_sec = 100;
152
153// For diagnostics to print a message once. see run_periodic_checks
154static sigset_t check_signal_done;
155static bool check_signals = true;
156
157// Signal number used to suspend/resume a thread
158
159// do not use any signal number less than SIGSEGV, see 4355769
160static int SR_signum = SIGUSR2;
161sigset_t SR_sigset;
162
163// utility functions
164
165static int SR_initialize();
166
167julong os::available_memory() {
168  return Linux::available_memory();
169}
170
171julong os::Linux::available_memory() {
172  // values in struct sysinfo are "unsigned long"
173  struct sysinfo si;
174  sysinfo(&si);
175
176  return (julong)si.freeram * si.mem_unit;
177}
178
179julong os::physical_memory() {
180  return Linux::physical_memory();
181}
182
183// Return true if user is running as root.
184
185bool os::have_special_privileges() {
186  static bool init = false;
187  static bool privileges = false;
188  if (!init) {
189    privileges = (getuid() != geteuid()) || (getgid() != getegid());
190    init = true;
191  }
192  return privileges;
193}
194
195
// Fallback definition of the gettid system call number for headers that do
// not provide it. Known values: i386 224, ia64 1105, amd64 186, sparc 143.
#ifndef SYS_gettid
  #if defined(__ia64__)
    #define SYS_gettid 1105
  #elif defined(__i386__)
    #define SYS_gettid 224
  #elif defined(__amd64__)
    #define SYS_gettid 186
  #elif defined(__sparc__)
    #define SYS_gettid 143
  #else
    #error define gettid for the arch
  #endif
#endif
216
217
218// pid_t gettid()
219//
220// Returns the kernel thread id of the currently running thread. Kernel
221// thread id is used to access /proc.
222pid_t os::Linux::gettid() {
223  int rslt = syscall(SYS_gettid);
224  assert(rslt != -1, "must be."); // old linuxthreads implementation?
225  return (pid_t)rslt;
226}
227
// Most versions of Linux determine the number of processors by looking at
// the /proc file system. In a chroot environment without /proc the system
// call reports 1, which makes the VM behave as if it ran on a single
// processor and elide locking (see is_MP() call).
//
// Set by os::Linux::initialize_system_info() when that situation is detected.
static bool unsafe_chroot_detected = false;

// Message text describing the situation above.
static const char *unstable_chroot_error =
    "/proc file system not found.\n"
    "Java may be unstable running multithreaded in a chroot "
    "environment on Linux when /proc filesystem is not mounted.";
236
237void os::Linux::initialize_system_info() {
238  set_processor_count(sysconf(_SC_NPROCESSORS_CONF));
239  if (processor_count() == 1) {
240    pid_t pid = os::Linux::gettid();
241    char fname[32];
242    jio_snprintf(fname, sizeof(fname), "/proc/%d", pid);
243    FILE *fp = fopen(fname, "r");
244    if (fp == NULL) {
245      unsafe_chroot_detected = true;
246    } else {
247      fclose(fp);
248    }
249  }
250  _physical_memory = (julong)sysconf(_SC_PHYS_PAGES) * (julong)sysconf(_SC_PAGESIZE);
251  assert(processor_count() > 0, "linux error");
252}
253
254void os::init_system_properties_values() {
255  // The next steps are taken in the product version:
256  //
257  // Obtain the JAVA_HOME value from the location of libjvm.so.
258  // This library should be located at:
259  // <JAVA_HOME>/lib/{client|server}/libjvm.so.
260  //
261  // If "/jre/lib/" appears at the right place in the path, then we
262  // assume libjvm.so is installed in a JDK and we use this path.
263  //
264  // Otherwise exit with message: "Could not create the Java virtual machine."
265  //
266  // The following extra steps are taken in the debugging version:
267  //
268  // If "/jre/lib/" does NOT appear at the right place in the path
269  // instead of exit check for $JAVA_HOME environment variable.
270  //
271  // If it is defined and we are able to locate $JAVA_HOME/jre/lib/<arch>,
272  // then we append a fake suffix "hotspot/libjvm.so" to this path so
273  // it looks like libjvm.so is installed there
274  // <JAVA_HOME>/jre/lib/<arch>/hotspot/libjvm.so.
275  //
276  // Otherwise exit.
277  //
278  // Important note: if the location of libjvm.so changes this
279  // code needs to be changed accordingly.
280
281  // See ld(1):
282  //      The linker uses the following search paths to locate required
283  //      shared libraries:
284  //        1: ...
285  //        ...
286  //        7: The default directories, normally /lib and /usr/lib.
287#if defined(AMD64) || (defined(_LP64) && defined(SPARC)) || defined(PPC64) || defined(S390)
288  #define DEFAULT_LIBPATH "/usr/lib64:/lib64:/lib:/usr/lib"
289#else
290  #define DEFAULT_LIBPATH "/lib:/usr/lib"
291#endif
292
293// Base path of extensions installed on the system.
294#define SYS_EXT_DIR     "/usr/java/packages"
295#define EXTENSIONS_DIR  "/lib/ext"
296
297  // Buffer that fits several sprintfs.
298  // Note that the space for the colon and the trailing null are provided
299  // by the nulls included by the sizeof operator.
300  const size_t bufsize =
301    MAX2((size_t)MAXPATHLEN,  // For dll_dir & friends.
302         (size_t)MAXPATHLEN + sizeof(EXTENSIONS_DIR) + sizeof(SYS_EXT_DIR) + sizeof(EXTENSIONS_DIR)); // extensions dir
303  char *buf = (char *)NEW_C_HEAP_ARRAY(char, bufsize, mtInternal);
304
305  // sysclasspath, java_home, dll_dir
306  {
307    char *pslash;
308    os::jvm_path(buf, bufsize);
309
310    // Found the full path to libjvm.so.
311    // Now cut the path to <java_home>/jre if we can.
312    pslash = strrchr(buf, '/');
313    if (pslash != NULL) {
314      *pslash = '\0';            // Get rid of /libjvm.so.
315    }
316    pslash = strrchr(buf, '/');
317    if (pslash != NULL) {
318      *pslash = '\0';            // Get rid of /{client|server|hotspot}.
319    }
320    Arguments::set_dll_dir(buf);
321
322    if (pslash != NULL) {
323      pslash = strrchr(buf, '/');
324      if (pslash != NULL) {
325        *pslash = '\0';        // Get rid of /lib.
326      }
327    }
328    Arguments::set_java_home(buf);
329    set_boot_path('/', ':');
330  }
331
332  // Where to look for native libraries.
333  //
334  // Note: Due to a legacy implementation, most of the library path
335  // is set in the launcher. This was to accomodate linking restrictions
336  // on legacy Linux implementations (which are no longer supported).
337  // Eventually, all the library path setting will be done here.
338  //
339  // However, to prevent the proliferation of improperly built native
340  // libraries, the new path component /usr/java/packages is added here.
341  // Eventually, all the library path setting will be done here.
342  {
343    // Get the user setting of LD_LIBRARY_PATH, and prepended it. It
344    // should always exist (until the legacy problem cited above is
345    // addressed).
346    const char *v = ::getenv("LD_LIBRARY_PATH");
347    const char *v_colon = ":";
348    if (v == NULL) { v = ""; v_colon = ""; }
349    // That's +1 for the colon and +1 for the trailing '\0'.
350    char *ld_library_path = (char *)NEW_C_HEAP_ARRAY(char,
351                                                     strlen(v) + 1 +
352                                                     sizeof(SYS_EXT_DIR) + sizeof("/lib/") + sizeof(DEFAULT_LIBPATH) + 1,
353                                                     mtInternal);
354    sprintf(ld_library_path, "%s%s" SYS_EXT_DIR "/lib:" DEFAULT_LIBPATH, v, v_colon);
355    Arguments::set_library_path(ld_library_path);
356    FREE_C_HEAP_ARRAY(char, ld_library_path);
357  }
358
359  // Extensions directories.
360  sprintf(buf, "%s" EXTENSIONS_DIR ":" SYS_EXT_DIR EXTENSIONS_DIR, Arguments::get_java_home());
361  Arguments::set_ext_dirs(buf);
362
363  FREE_C_HEAP_ARRAY(char, buf);
364
365#undef DEFAULT_LIBPATH
366#undef SYS_EXT_DIR
367#undef EXTENSIONS_DIR
368}
369
370////////////////////////////////////////////////////////////////////////////////
371// breakpoint support
372
373void os::breakpoint() {
374  BREAKPOINT;
375}
376
// Intentionally empty C-linkage function: a convenient place to set a
// breakpoint from a debugger.
extern "C" void breakpoint() {
}
380
381////////////////////////////////////////////////////////////////////////////////
382// signal support
383
384debug_only(static bool signal_sets_initialized = false);
385static sigset_t unblocked_sigs, vm_sigs;
386
387bool os::Linux::is_sig_ignored(int sig) {
388  struct sigaction oact;
389  sigaction(sig, (struct sigaction*)NULL, &oact);
390  void* ohlr = oact.sa_sigaction ? CAST_FROM_FN_PTR(void*,  oact.sa_sigaction)
391                                 : CAST_FROM_FN_PTR(void*,  oact.sa_handler);
392  if (ohlr == CAST_FROM_FN_PTR(void*, SIG_IGN)) {
393    return true;
394  } else {
395    return false;
396  }
397}
398
399void os::Linux::signal_sets_init() {
400  // Should also have an assertion stating we are still single-threaded.
401  assert(!signal_sets_initialized, "Already initialized");
402  // Fill in signals that are necessarily unblocked for all threads in
403  // the VM. Currently, we unblock the following signals:
404  // SHUTDOWN{1,2,3}_SIGNAL: for shutdown hooks support (unless over-ridden
405  //                         by -Xrs (=ReduceSignalUsage));
406  // BREAK_SIGNAL which is unblocked only by the VM thread and blocked by all
407  // other threads. The "ReduceSignalUsage" boolean tells us not to alter
408  // the dispositions or masks wrt these signals.
409  // Programs embedding the VM that want to use the above signals for their
410  // own purposes must, at this time, use the "-Xrs" option to prevent
411  // interference with shutdown hooks and BREAK_SIGNAL thread dumping.
412  // (See bug 4345157, and other related bugs).
413  // In reality, though, unblocking these signals is really a nop, since
414  // these signals are not blocked by default.
415  sigemptyset(&unblocked_sigs);
416  sigaddset(&unblocked_sigs, SIGILL);
417  sigaddset(&unblocked_sigs, SIGSEGV);
418  sigaddset(&unblocked_sigs, SIGBUS);
419  sigaddset(&unblocked_sigs, SIGFPE);
420#if defined(PPC64)
421  sigaddset(&unblocked_sigs, SIGTRAP);
422#endif
423  sigaddset(&unblocked_sigs, SR_signum);
424
425  if (!ReduceSignalUsage) {
426    if (!os::Linux::is_sig_ignored(SHUTDOWN1_SIGNAL)) {
427      sigaddset(&unblocked_sigs, SHUTDOWN1_SIGNAL);
428    }
429    if (!os::Linux::is_sig_ignored(SHUTDOWN2_SIGNAL)) {
430      sigaddset(&unblocked_sigs, SHUTDOWN2_SIGNAL);
431    }
432    if (!os::Linux::is_sig_ignored(SHUTDOWN3_SIGNAL)) {
433      sigaddset(&unblocked_sigs, SHUTDOWN3_SIGNAL);
434    }
435  }
436  // Fill in signals that are blocked by all but the VM thread.
437  sigemptyset(&vm_sigs);
438  if (!ReduceSignalUsage) {
439    sigaddset(&vm_sigs, BREAK_SIGNAL);
440  }
441  debug_only(signal_sets_initialized = true);
442
443}
444
445// These are signals that are unblocked while a thread is running Java.
446// (For some reason, they get blocked by default.)
447sigset_t* os::Linux::unblocked_signals() {
448  assert(signal_sets_initialized, "Not initialized");
449  return &unblocked_sigs;
450}
451
452// These are the signals that are blocked while a (non-VM) thread is
453// running Java. Only the VM thread handles these signals.
454sigset_t* os::Linux::vm_signals() {
455  assert(signal_sets_initialized, "Not initialized");
456  return &vm_sigs;
457}
458
459void os::Linux::hotspot_sigmask(Thread* thread) {
460
461  //Save caller's signal mask before setting VM signal mask
462  sigset_t caller_sigmask;
463  pthread_sigmask(SIG_BLOCK, NULL, &caller_sigmask);
464
465  OSThread* osthread = thread->osthread();
466  osthread->set_caller_sigmask(caller_sigmask);
467
468  pthread_sigmask(SIG_UNBLOCK, os::Linux::unblocked_signals(), NULL);
469
470  if (!ReduceSignalUsage) {
471    if (thread->is_VM_thread()) {
472      // Only the VM thread handles BREAK_SIGNAL ...
473      pthread_sigmask(SIG_UNBLOCK, vm_signals(), NULL);
474    } else {
475      // ... all other threads block BREAK_SIGNAL
476      pthread_sigmask(SIG_BLOCK, vm_signals(), NULL);
477    }
478  }
479}
480
481//////////////////////////////////////////////////////////////////////////////
482// detecting pthread library
483
484void os::Linux::libpthread_init() {
485  // Save glibc and pthread version strings.
486#if !defined(_CS_GNU_LIBC_VERSION) || \
487    !defined(_CS_GNU_LIBPTHREAD_VERSION)
488  #error "glibc too old (< 2.3.2)"
489#endif
490
491  size_t n = confstr(_CS_GNU_LIBC_VERSION, NULL, 0);
492  assert(n > 0, "cannot retrieve glibc version");
493  char *str = (char *)malloc(n, mtInternal);
494  confstr(_CS_GNU_LIBC_VERSION, str, n);
495  os::Linux::set_glibc_version(str);
496
497  n = confstr(_CS_GNU_LIBPTHREAD_VERSION, NULL, 0);
498  assert(n > 0, "cannot retrieve pthread version");
499  str = (char *)malloc(n, mtInternal);
500  confstr(_CS_GNU_LIBPTHREAD_VERSION, str, n);
501  os::Linux::set_libpthread_version(str);
502}
503
504/////////////////////////////////////////////////////////////////////////////
505// thread stack expansion
506
507// os::Linux::manually_expand_stack() takes care of expanding the thread
508// stack. Note that this is normally not needed: pthread stacks allocate
509// thread stack using mmap() without MAP_NORESERVE, so the stack is already
510// committed. Therefore it is not necessary to expand the stack manually.
511//
512// Manually expanding the stack was historically needed on LinuxThreads
513// thread stacks, which were allocated with mmap(MAP_GROWSDOWN). Nowadays
514// it is kept to deal with very rare corner cases:
515//
516// For one, user may run the VM on an own implementation of threads
517// whose stacks are - like the old LinuxThreads - implemented using
518// mmap(MAP_GROWSDOWN).
519//
520// Also, this coding may be needed if the VM is running on the primordial
521// thread. Normally we avoid running on the primordial thread; however,
522// user may still invoke the VM on the primordial thread.
523//
524// The following historical comment describes the details about running
525// on a thread stack allocated with mmap(MAP_GROWSDOWN):
526
527
528// Force Linux kernel to expand current thread stack. If "bottom" is close
529// to the stack guard, caller should block all signals.
530//
531// MAP_GROWSDOWN:
532//   A special mmap() flag that is used to implement thread stacks. It tells
533//   kernel that the memory region should extend downwards when needed. This
534//   allows early versions of LinuxThreads to only mmap the first few pages
535//   when creating a new thread. Linux kernel will automatically expand thread
536//   stack as needed (on page faults).
537//
538//   However, because the memory region of a MAP_GROWSDOWN stack can grow on
539//   demand, if a page fault happens outside an already mapped MAP_GROWSDOWN
540//   region, it's hard to tell if the fault is due to a legitimate stack
//   access or because of reading/writing non-existent memory (e.g. buffer
542//   overrun). As a rule, if the fault happens below current stack pointer,
543//   Linux kernel does not expand stack, instead a SIGSEGV is sent to the
544//   application (see Linux kernel fault.c).
545//
546//   This Linux feature can cause SIGSEGV when VM bangs thread stack for
547//   stack overflow detection.
548//
549//   Newer version of LinuxThreads (since glibc-2.2, or, RH-7.x) and NPTL do
550//   not use MAP_GROWSDOWN.
551//
552// To get around the problem and allow stack banging on Linux, we need to
553// manually expand thread stack after receiving the SIGSEGV.
554//
555// There are two ways to expand thread stack to address "bottom", we used
556// both of them in JVM before 1.5:
557//   1. adjust stack pointer first so that it is below "bottom", and then
558//      touch "bottom"
559//   2. mmap() the page in question
560//
561// Now alternate signal stack is gone, it's harder to use 2. For instance,
562// if current sp is already near the lower end of page 101, and we need to
563// call mmap() to map page 100, it is possible that part of the mmap() frame
564// will be placed in page 100. When page 100 is mapped, it is zero-filled.
565// That will destroy the mmap() frame and cause VM to crash.
566//
567// The following code works by adjusting sp first, then accessing the "bottom"
568// page to force a page fault. Linux kernel will then automatically expand the
569// stack mapping.
570//
571// _expand_stack_to() assumes its frame size is less than page size, which
572// should always be true if the function is not inlined.
573
574static void NOINLINE _expand_stack_to(address bottom) {
575  address sp;
576  size_t size;
577  volatile char *p;
578
579  // Adjust bottom to point to the largest address within the same page, it
580  // gives us a one-page buffer if alloca() allocates slightly more memory.
581  bottom = (address)align_size_down((uintptr_t)bottom, os::Linux::page_size());
582  bottom += os::Linux::page_size() - 1;
583
584  // sp might be slightly above current stack pointer; if that's the case, we
585  // will alloca() a little more space than necessary, which is OK. Don't use
586  // os::current_stack_pointer(), as its result can be slightly below current
587  // stack pointer, causing us to not alloca enough to reach "bottom".
588  sp = (address)&sp;
589
590  if (sp > bottom) {
591    size = sp - bottom;
592    p = (volatile char *)alloca(size);
593    assert(p != NULL && p <= (volatile char *)bottom, "alloca problem?");
594    p[0] = '\0';
595  }
596}
597
598bool os::Linux::manually_expand_stack(JavaThread * t, address addr) {
599  assert(t!=NULL, "just checking");
600  assert(t->osthread()->expanding_stack(), "expand should be set");
601  assert(t->stack_base() != NULL, "stack_base was not initialized");
602
603  if (addr <  t->stack_base() && addr >= t->stack_reserved_zone_base()) {
604    sigset_t mask_all, old_sigset;
605    sigfillset(&mask_all);
606    pthread_sigmask(SIG_SETMASK, &mask_all, &old_sigset);
607    _expand_stack_to(addr);
608    pthread_sigmask(SIG_SETMASK, &old_sigset, NULL);
609    return true;
610  }
611  return false;
612}
613
614//////////////////////////////////////////////////////////////////////////////
615// create new thread
616
617// Thread start routine for all newly created threads
618static void *thread_native_entry(Thread *thread) {
619  // Try to randomize the cache line index of hot stack frames.
620  // This helps when threads of the same stack traces evict each other's
621  // cache lines. The threads can be either from the same JVM instance, or
622  // from different JVM instances. The benefit is especially true for
623  // processors with hyperthreading technology.
624  static int counter = 0;
625  int pid = os::current_process_id();
626  alloca(((pid ^ counter++) & 7) * 128);
627
628  thread->initialize_thread_current();
629
630  OSThread* osthread = thread->osthread();
631  Monitor* sync = osthread->startThread_lock();
632
633  osthread->set_thread_id(os::current_thread_id());
634
635  log_info(os, thread)("Thread is alive (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
636    os::current_thread_id(), (uintx) pthread_self());
637
638  if (UseNUMA) {
639    int lgrp_id = os::numa_get_group_id();
640    if (lgrp_id != -1) {
641      thread->set_lgrp_id(lgrp_id);
642    }
643  }
644  // initialize signal mask for this thread
645  os::Linux::hotspot_sigmask(thread);
646
647  // initialize floating point control register
648  os::Linux::init_thread_fpu_state();
649
650  // handshaking with parent thread
651  {
652    MutexLockerEx ml(sync, Mutex::_no_safepoint_check_flag);
653
654    // notify parent thread
655    osthread->set_state(INITIALIZED);
656    sync->notify_all();
657
658    // wait until os::start_thread()
659    while (osthread->get_state() == INITIALIZED) {
660      sync->wait(Mutex::_no_safepoint_check_flag);
661    }
662  }
663
664  // call one more level start routine
665  thread->run();
666
667  log_info(os, thread)("Thread finished (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
668    os::current_thread_id(), (uintx) pthread_self());
669
670  // If a thread has not deleted itself ("delete this") as part of its
671  // termination sequence, we have to ensure thread-local-storage is
672  // cleared before we actually terminate. No threads should ever be
673  // deleted asynchronously with respect to their termination.
674  if (Thread::current_or_null_safe() != NULL) {
675    assert(Thread::current_or_null_safe() == thread, "current thread is wrong");
676    thread->clear_thread_current();
677  }
678
679  return 0;
680}
681
682bool os::create_thread(Thread* thread, ThreadType thr_type,
683                       size_t req_stack_size) {
684  assert(thread->osthread() == NULL, "caller responsible");
685
686  // Allocate the OSThread object
687  OSThread* osthread = new OSThread(NULL, NULL);
688  if (osthread == NULL) {
689    return false;
690  }
691
692  // set the correct thread state
693  osthread->set_thread_type(thr_type);
694
695  // Initial state is ALLOCATED but not INITIALIZED
696  osthread->set_state(ALLOCATED);
697
698  thread->set_osthread(osthread);
699
700  // init thread attributes
701  pthread_attr_t attr;
702  pthread_attr_init(&attr);
703  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
704
705  // Calculate stack size if it's not specified by caller.
706  size_t stack_size = os::Posix::get_initial_stack_size(thr_type, req_stack_size);
707  // In the Linux NPTL pthread implementation the guard size mechanism
708  // is not implemented properly. The posix standard requires adding
709  // the size of the guard pages to the stack size, instead Linux
710  // takes the space out of 'stacksize'. Thus we adapt the requested
711  // stack_size by the size of the guard pages to mimick proper
712  // behaviour. However, be careful not to end up with a size
713  // of zero due to overflow. Don't add the guard page in that case.
714  size_t guard_size = os::Linux::default_guard_size(thr_type);
715  if (stack_size <= SIZE_MAX - guard_size) {
716    stack_size += guard_size;
717  }
718  assert(is_size_aligned(stack_size, os::vm_page_size()), "stack_size not aligned");
719
720  int status = pthread_attr_setstacksize(&attr, stack_size);
721  assert_status(status == 0, status, "pthread_attr_setstacksize");
722
723  // Configure glibc guard page.
724  pthread_attr_setguardsize(&attr, os::Linux::default_guard_size(thr_type));
725
726  ThreadState state;
727
728  {
729    pthread_t tid;
730    int ret = pthread_create(&tid, &attr, (void* (*)(void*)) thread_native_entry, thread);
731
732    char buf[64];
733    if (ret == 0) {
734      log_info(os, thread)("Thread started (pthread id: " UINTX_FORMAT ", attributes: %s). ",
735        (uintx) tid, os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
736    } else {
737      log_warning(os, thread)("Failed to start thread - pthread_create failed (%s) for attributes: %s.",
738        os::errno_name(ret), os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
739    }
740
741    pthread_attr_destroy(&attr);
742
743    if (ret != 0) {
744      // Need to clean up stuff we've allocated so far
745      thread->set_osthread(NULL);
746      delete osthread;
747      return false;
748    }
749
750    // Store pthread info into the OSThread
751    osthread->set_pthread_id(tid);
752
753    // Wait until child thread is either initialized or aborted
754    {
755      Monitor* sync_with_child = osthread->startThread_lock();
756      MutexLockerEx ml(sync_with_child, Mutex::_no_safepoint_check_flag);
757      while ((state = osthread->get_state()) == ALLOCATED) {
758        sync_with_child->wait(Mutex::_no_safepoint_check_flag);
759      }
760    }
761  }
762
763  // Aborted due to thread limit being reached
764  if (state == ZOMBIE) {
765    thread->set_osthread(NULL);
766    delete osthread;
767    return false;
768  }
769
770  // The thread is returned suspended (in state INITIALIZED),
771  // and is started higher up in the call chain
772  assert(state == INITIALIZED, "race condition");
773  return true;
774}
775
776/////////////////////////////////////////////////////////////////////////////
777// attach existing thread
778
779// bootstrap the main thread
780bool os::create_main_thread(JavaThread* thread) {
781  assert(os::Linux::_main_thread == pthread_self(), "should be called inside main thread");
782  return create_attached_thread(thread);
783}
784
785bool os::create_attached_thread(JavaThread* thread) {
786#ifdef ASSERT
787  thread->verify_not_published();
788#endif
789
790  // Allocate the OSThread object
791  OSThread* osthread = new OSThread(NULL, NULL);
792
793  if (osthread == NULL) {
794    return false;
795  }
796
797  // Store pthread info into the OSThread
798  osthread->set_thread_id(os::Linux::gettid());
799  osthread->set_pthread_id(::pthread_self());
800
801  // initialize floating point control register
802  os::Linux::init_thread_fpu_state();
803
804  // Initial thread state is RUNNABLE
805  osthread->set_state(RUNNABLE);
806
807  thread->set_osthread(osthread);
808
809  if (UseNUMA) {
810    int lgrp_id = os::numa_get_group_id();
811    if (lgrp_id != -1) {
812      thread->set_lgrp_id(lgrp_id);
813    }
814  }
815
816  if (os::Linux::is_initial_thread()) {
817    // If current thread is initial thread, its stack is mapped on demand,
818    // see notes about MAP_GROWSDOWN. Here we try to force kernel to map
819    // the entire stack region to avoid SEGV in stack banging.
820    // It is also useful to get around the heap-stack-gap problem on SuSE
821    // kernel (see 4821821 for details). We first expand stack to the top
822    // of yellow zone, then enable stack yellow zone (order is significant,
823    // enabling yellow zone first will crash JVM on SuSE Linux), so there
824    // is no gap between the last two virtual memory regions.
825
826    JavaThread *jt = (JavaThread *)thread;
827    address addr = jt->stack_reserved_zone_base();
828    assert(addr != NULL, "initialization problem?");
829    assert(jt->stack_available(addr) > 0, "stack guard should not be enabled");
830
831    osthread->set_expanding_stack();
832    os::Linux::manually_expand_stack(jt, addr);
833    osthread->clear_expanding_stack();
834  }
835
836  // initialize signal mask for this thread
837  // and save the caller's signal mask
838  os::Linux::hotspot_sigmask(thread);
839
840  log_info(os, thread)("Thread attached (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
841    os::current_thread_id(), (uintx) pthread_self());
842
843  return true;
844}
845
846void os::pd_start_thread(Thread* thread) {
847  OSThread * osthread = thread->osthread();
848  assert(osthread->get_state() != INITIALIZED, "just checking");
849  Monitor* sync_with_child = osthread->startThread_lock();
850  MutexLockerEx ml(sync_with_child, Mutex::_no_safepoint_check_flag);
851  sync_with_child->notify();
852}
853
854// Free Linux resources related to the OSThread
855void os::free_thread(OSThread* osthread) {
856  assert(osthread != NULL, "osthread not set");
857
858  // We are told to free resources of the argument thread,
859  // but we can only really operate on the current thread.
860  assert(Thread::current()->osthread() == osthread,
861         "os::free_thread but not current thread");
862
863#ifdef ASSERT
864  sigset_t current;
865  sigemptyset(&current);
866  pthread_sigmask(SIG_SETMASK, NULL, &current);
867  assert(!sigismember(&current, SR_signum), "SR signal should not be blocked!");
868#endif
869
870  // Restore caller's signal mask
871  sigset_t sigmask = osthread->caller_sigmask();
872  pthread_sigmask(SIG_SETMASK, &sigmask, NULL);
873
874  delete osthread;
875}
876
877//////////////////////////////////////////////////////////////////////////////
878// initial thread
879
880// Check if current thread is the initial thread, similar to Solaris thr_main.
881bool os::Linux::is_initial_thread(void) {
882  char dummy;
883  // If called before init complete, thread stack bottom will be null.
884  // Can be called if fatal error occurs before initialization.
885  if (initial_thread_stack_bottom() == NULL) return false;
886  assert(initial_thread_stack_bottom() != NULL &&
887         initial_thread_stack_size()   != 0,
888         "os::init did not locate initial thread's stack region");
889  if ((address)&dummy >= initial_thread_stack_bottom() &&
890      (address)&dummy < initial_thread_stack_bottom() + initial_thread_stack_size()) {
891    return true;
892  } else {
893    return false;
894  }
895}
896
897// Find the virtual memory area that contains addr
898static bool find_vma(address addr, address* vma_low, address* vma_high) {
899  FILE *fp = fopen("/proc/self/maps", "r");
900  if (fp) {
901    address low, high;
902    while (!feof(fp)) {
903      if (fscanf(fp, "%p-%p", &low, &high) == 2) {
904        if (low <= addr && addr < high) {
905          if (vma_low)  *vma_low  = low;
906          if (vma_high) *vma_high = high;
907          fclose(fp);
908          return true;
909        }
910      }
911      for (;;) {
912        int ch = fgetc(fp);
913        if (ch == EOF || ch == (int)'\n') break;
914      }
915    }
916    fclose(fp);
917  }
918  return false;
919}
920
921// Locate initial thread stack. This special handling of initial thread stack
922// is needed because pthread_getattr_np() on most (all?) Linux distros returns
923// bogus value for the primordial process thread. While the launcher has created
924// the VM in a new thread since JDK 6, we still have to allow for the use of the
925// JNI invocation API from a primordial thread.
926void os::Linux::capture_initial_stack(size_t max_size) {
927
928  // max_size is either 0 (which means accept OS default for thread stacks) or
929  // a user-specified value known to be at least the minimum needed. If we
930  // are actually on the primordial thread we can make it appear that we have a
931  // smaller max_size stack by inserting the guard pages at that location. But we
932  // cannot do anything to emulate a larger stack than what has been provided by
933  // the OS or threading library. In fact if we try to use a stack greater than
934  // what is set by rlimit then we will crash the hosting process.
935
936  // Maximum stack size is the easy part, get it from RLIMIT_STACK.
937  // If this is "unlimited" then it will be a huge value.
938  struct rlimit rlim;
939  getrlimit(RLIMIT_STACK, &rlim);
940  size_t stack_size = rlim.rlim_cur;
941
942  // 6308388: a bug in ld.so will relocate its own .data section to the
943  //   lower end of primordial stack; reduce ulimit -s value a little bit
944  //   so we won't install guard page on ld.so's data section.
945  stack_size -= 2 * page_size();
946
947  // Try to figure out where the stack base (top) is. This is harder.
948  //
949  // When an application is started, glibc saves the initial stack pointer in
950  // a global variable "__libc_stack_end", which is then used by system
951  // libraries. __libc_stack_end should be pretty close to stack top. The
952  // variable is available since the very early days. However, because it is
953  // a private interface, it could disappear in the future.
954  //
955  // Linux kernel saves start_stack information in /proc/<pid>/stat. Similar
956  // to __libc_stack_end, it is very close to stack top, but isn't the real
957  // stack top. Note that /proc may not exist if VM is running as a chroot
958  // program, so reading /proc/<pid>/stat could fail. Also the contents of
959  // /proc/<pid>/stat could change in the future (though unlikely).
960  //
961  // We try __libc_stack_end first. If that doesn't work, look for
962  // /proc/<pid>/stat. If neither of them works, we use current stack pointer
963  // as a hint, which should work well in most cases.
964
965  uintptr_t stack_start;
966
967  // try __libc_stack_end first
968  uintptr_t *p = (uintptr_t *)dlsym(RTLD_DEFAULT, "__libc_stack_end");
969  if (p && *p) {
970    stack_start = *p;
971  } else {
972    // see if we can get the start_stack field from /proc/self/stat
973    FILE *fp;
974    int pid;
975    char state;
976    int ppid;
977    int pgrp;
978    int session;
979    int nr;
980    int tpgrp;
981    unsigned long flags;
982    unsigned long minflt;
983    unsigned long cminflt;
984    unsigned long majflt;
985    unsigned long cmajflt;
986    unsigned long utime;
987    unsigned long stime;
988    long cutime;
989    long cstime;
990    long prio;
991    long nice;
992    long junk;
993    long it_real;
994    uintptr_t start;
995    uintptr_t vsize;
996    intptr_t rss;
997    uintptr_t rsslim;
998    uintptr_t scodes;
999    uintptr_t ecode;
1000    int i;
1001
1002    // Figure what the primordial thread stack base is. Code is inspired
1003    // by email from Hans Boehm. /proc/self/stat begins with current pid,
1004    // followed by command name surrounded by parentheses, state, etc.
1005    char stat[2048];
1006    int statlen;
1007
1008    fp = fopen("/proc/self/stat", "r");
1009    if (fp) {
1010      statlen = fread(stat, 1, 2047, fp);
1011      stat[statlen] = '\0';
1012      fclose(fp);
1013
1014      // Skip pid and the command string. Note that we could be dealing with
1015      // weird command names, e.g. user could decide to rename java launcher
1016      // to "java 1.4.2 :)", then the stat file would look like
1017      //                1234 (java 1.4.2 :)) R ... ...
1018      // We don't really need to know the command string, just find the last
1019      // occurrence of ")" and then start parsing from there. See bug 4726580.
1020      char * s = strrchr(stat, ')');
1021
1022      i = 0;
1023      if (s) {
1024        // Skip blank chars
1025        do { s++; } while (s && isspace(*s));
1026
1027#define _UFM UINTX_FORMAT
1028#define _DFM INTX_FORMAT
1029
1030        //                                     1   1   1   1   1   1   1   1   1   1   2   2    2    2    2    2    2    2    2
1031        //              3  4  5  6  7  8   9   0   1   2   3   4   5   6   7   8   9   0   1    2    3    4    5    6    7    8
1032        i = sscanf(s, "%c %d %d %d %d %d %lu %lu %lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld " _UFM _UFM _DFM _UFM _UFM _UFM _UFM,
1033                   &state,          // 3  %c
1034                   &ppid,           // 4  %d
1035                   &pgrp,           // 5  %d
1036                   &session,        // 6  %d
1037                   &nr,             // 7  %d
1038                   &tpgrp,          // 8  %d
1039                   &flags,          // 9  %lu
1040                   &minflt,         // 10 %lu
1041                   &cminflt,        // 11 %lu
1042                   &majflt,         // 12 %lu
1043                   &cmajflt,        // 13 %lu
1044                   &utime,          // 14 %lu
1045                   &stime,          // 15 %lu
1046                   &cutime,         // 16 %ld
1047                   &cstime,         // 17 %ld
1048                   &prio,           // 18 %ld
1049                   &nice,           // 19 %ld
1050                   &junk,           // 20 %ld
1051                   &it_real,        // 21 %ld
1052                   &start,          // 22 UINTX_FORMAT
1053                   &vsize,          // 23 UINTX_FORMAT
1054                   &rss,            // 24 INTX_FORMAT
1055                   &rsslim,         // 25 UINTX_FORMAT
1056                   &scodes,         // 26 UINTX_FORMAT
1057                   &ecode,          // 27 UINTX_FORMAT
1058                   &stack_start);   // 28 UINTX_FORMAT
1059      }
1060
1061#undef _UFM
1062#undef _DFM
1063
1064      if (i != 28 - 2) {
1065        assert(false, "Bad conversion from /proc/self/stat");
1066        // product mode - assume we are the initial thread, good luck in the
1067        // embedded case.
1068        warning("Can't detect initial thread stack location - bad conversion");
1069        stack_start = (uintptr_t) &rlim;
1070      }
1071    } else {
1072      // For some reason we can't open /proc/self/stat (for example, running on
1073      // FreeBSD with a Linux emulator, or inside chroot), this should work for
1074      // most cases, so don't abort:
1075      warning("Can't detect initial thread stack location - no /proc/self/stat");
1076      stack_start = (uintptr_t) &rlim;
1077    }
1078  }
1079
1080  // Now we have a pointer (stack_start) very close to the stack top, the
1081  // next thing to do is to figure out the exact location of stack top. We
1082  // can find out the virtual memory area that contains stack_start by
1083  // reading /proc/self/maps, it should be the last vma in /proc/self/maps,
1084  // and its upper limit is the real stack top. (again, this would fail if
1085  // running inside chroot, because /proc may not exist.)
1086
1087  uintptr_t stack_top;
1088  address low, high;
1089  if (find_vma((address)stack_start, &low, &high)) {
1090    // success, "high" is the true stack top. (ignore "low", because initial
1091    // thread stack grows on demand, its real bottom is high - RLIMIT_STACK.)
1092    stack_top = (uintptr_t)high;
1093  } else {
1094    // failed, likely because /proc/self/maps does not exist
1095    warning("Can't detect initial thread stack location - find_vma failed");
1096    // best effort: stack_start is normally within a few pages below the real
1097    // stack top, use it as stack top, and reduce stack size so we won't put
1098    // guard page outside stack.
1099    stack_top = stack_start;
1100    stack_size -= 16 * page_size();
1101  }
1102
1103  // stack_top could be partially down the page so align it
1104  stack_top = align_size_up(stack_top, page_size());
1105
1106  // Allowed stack value is minimum of max_size and what we derived from rlimit
1107  if (max_size > 0) {
1108    _initial_thread_stack_size = MIN2(max_size, stack_size);
1109  } else {
1110    // Accept the rlimit max, but if stack is unlimited then it will be huge, so
1111    // clamp it at 8MB as we do on Solaris
1112    _initial_thread_stack_size = MIN2(stack_size, 8*M);
1113  }
1114  _initial_thread_stack_size = align_size_down(_initial_thread_stack_size, page_size());
1115  _initial_thread_stack_bottom = (address)stack_top - _initial_thread_stack_size;
1116
1117  assert(_initial_thread_stack_bottom < (address)stack_top, "overflow!");
1118
1119  if (log_is_enabled(Info, os, thread)) {
1120    // See if we seem to be on primordial process thread
1121    bool primordial = uintptr_t(&rlim) > uintptr_t(_initial_thread_stack_bottom) &&
1122                      uintptr_t(&rlim) < stack_top;
1123
1124    log_info(os, thread)("Capturing initial stack in %s thread: req. size: " SIZE_FORMAT "K, actual size: "
1125                         SIZE_FORMAT "K, top=" INTPTR_FORMAT ", bottom=" INTPTR_FORMAT,
1126                         primordial ? "primordial" : "user", max_size / K,  _initial_thread_stack_size / K,
1127                         stack_top, intptr_t(_initial_thread_stack_bottom));
1128  }
1129}
1130
1131////////////////////////////////////////////////////////////////////////////////
1132// time support
1133
1134// Time since start-up in seconds to a fine granularity.
1135// Used by VMSelfDestructTimer and the MemProfiler.
1136double os::elapsedTime() {
1137
1138  return ((double)os::elapsed_counter()) / os::elapsed_frequency(); // nanosecond resolution
1139}
1140
1141jlong os::elapsed_counter() {
1142  return javaTimeNanos() - initial_time_count;
1143}
1144
1145jlong os::elapsed_frequency() {
1146  return NANOSECS_PER_SEC; // nanosecond resolution
1147}
1148
1149bool os::supports_vtime() { return true; }
1150bool os::enable_vtime()   { return false; }
1151bool os::vtime_enabled()  { return false; }
1152
1153double os::elapsedVTime() {
1154  struct rusage usage;
1155  int retval = getrusage(RUSAGE_THREAD, &usage);
1156  if (retval == 0) {
1157    return (double) (usage.ru_utime.tv_sec + usage.ru_stime.tv_sec) + (double) (usage.ru_utime.tv_usec + usage.ru_stime.tv_usec) / (1000 * 1000);
1158  } else {
1159    // better than nothing, but not much
1160    return elapsedTime();
1161  }
1162}
1163
1164jlong os::javaTimeMillis() {
1165  timeval time;
1166  int status = gettimeofday(&time, NULL);
1167  assert(status != -1, "linux error");
1168  return jlong(time.tv_sec) * 1000  +  jlong(time.tv_usec / 1000);
1169}
1170
1171void os::javaTimeSystemUTC(jlong &seconds, jlong &nanos) {
1172  timeval time;
1173  int status = gettimeofday(&time, NULL);
1174  assert(status != -1, "linux error");
1175  seconds = jlong(time.tv_sec);
1176  nanos = jlong(time.tv_usec) * 1000;
1177}
1178
1179
// Fallback definition, used only when no header has defined CLOCK_MONOTONIC.
#if !defined(CLOCK_MONOTONIC)
  #define CLOCK_MONOTONIC (1)
#endif
1183
1184void os::Linux::clock_init() {
1185  // we do dlopen's in this particular order due to bug in linux
1186  // dynamical loader (see 6348968) leading to crash on exit
1187  void* handle = dlopen("librt.so.1", RTLD_LAZY);
1188  if (handle == NULL) {
1189    handle = dlopen("librt.so", RTLD_LAZY);
1190  }
1191
1192  if (handle) {
1193    int (*clock_getres_func)(clockid_t, struct timespec*) =
1194           (int(*)(clockid_t, struct timespec*))dlsym(handle, "clock_getres");
1195    int (*clock_gettime_func)(clockid_t, struct timespec*) =
1196           (int(*)(clockid_t, struct timespec*))dlsym(handle, "clock_gettime");
1197    if (clock_getres_func && clock_gettime_func) {
1198      // See if monotonic clock is supported by the kernel. Note that some
1199      // early implementations simply return kernel jiffies (updated every
1200      // 1/100 or 1/1000 second). It would be bad to use such a low res clock
1201      // for nano time (though the monotonic property is still nice to have).
1202      // It's fixed in newer kernels, however clock_getres() still returns
1203      // 1/HZ. We check if clock_getres() works, but will ignore its reported
1204      // resolution for now. Hopefully as people move to new kernels, this
1205      // won't be a problem.
1206      struct timespec res;
1207      struct timespec tp;
1208      if (clock_getres_func (CLOCK_MONOTONIC, &res) == 0 &&
1209          clock_gettime_func(CLOCK_MONOTONIC, &tp)  == 0) {
1210        // yes, monotonic clock is supported
1211        _clock_gettime = clock_gettime_func;
1212        return;
1213      } else {
1214        // close librt if there is no monotonic clock
1215        dlclose(handle);
1216      }
1217    }
1218  }
1219  warning("No monotonic clock was available - timed services may " \
1220          "be adversely affected if the time-of-day clock changes");
1221}
1222
// sys_clock_getres(x,y): clock_getres via a raw syscall. When the headers do
// not provide SYS_clock_getres, a hard-coded syscall number is supplied for
// X86, PPC64 and S390; on any other platform the macro evaluates to -1.
#if defined(SYS_clock_getres)
  #define sys_clock_getres(x,y)  ::syscall(SYS_clock_getres, x, y)
#elif defined(X86) || defined(PPC64) || defined(S390)
  #define SYS_clock_getres AMD64_ONLY(229) IA32_ONLY(266) PPC64_ONLY(247) S390_ONLY(261)
  #define sys_clock_getres(x,y)  ::syscall(SYS_clock_getres, x, y)
#else
  #warning "SYS_clock_getres not defined for this platform, disabling fast_thread_cpu_time"
  #define sys_clock_getres(x,y)  -1
#endif
1234
1235void os::Linux::fast_thread_clock_init() {
1236  if (!UseLinuxPosixThreadCPUClocks) {
1237    return;
1238  }
1239  clockid_t clockid;
1240  struct timespec tp;
1241  int (*pthread_getcpuclockid_func)(pthread_t, clockid_t *) =
1242      (int(*)(pthread_t, clockid_t *)) dlsym(RTLD_DEFAULT, "pthread_getcpuclockid");
1243
1244  // Switch to using fast clocks for thread cpu time if
1245  // the sys_clock_getres() returns 0 error code.
1246  // Note, that some kernels may support the current thread
1247  // clock (CLOCK_THREAD_CPUTIME_ID) but not the clocks
1248  // returned by the pthread_getcpuclockid().
1249  // If the fast Posix clocks are supported then the sys_clock_getres()
1250  // must return at least tp.tv_sec == 0 which means a resolution
1251  // better than 1 sec. This is extra check for reliability.
1252
1253  if (pthread_getcpuclockid_func &&
1254      pthread_getcpuclockid_func(_main_thread, &clockid) == 0 &&
1255      sys_clock_getres(clockid, &tp) == 0 && tp.tv_sec == 0) {
1256    _supports_fast_thread_cpu_time = true;
1257    _pthread_getcpuclockid = pthread_getcpuclockid_func;
1258  }
1259}
1260
1261jlong os::javaTimeNanos() {
1262  if (os::supports_monotonic_clock()) {
1263    struct timespec tp;
1264    int status = Linux::clock_gettime(CLOCK_MONOTONIC, &tp);
1265    assert(status == 0, "gettime error");
1266    jlong result = jlong(tp.tv_sec) * (1000 * 1000 * 1000) + jlong(tp.tv_nsec);
1267    return result;
1268  } else {
1269    timeval time;
1270    int status = gettimeofday(&time, NULL);
1271    assert(status != -1, "linux error");
1272    jlong usecs = jlong(time.tv_sec) * (1000 * 1000) + jlong(time.tv_usec);
1273    return 1000 * usecs;
1274  }
1275}
1276
1277void os::javaTimeNanos_info(jvmtiTimerInfo *info_ptr) {
1278  if (os::supports_monotonic_clock()) {
1279    info_ptr->max_value = ALL_64_BITS;
1280
1281    // CLOCK_MONOTONIC - amount of time since some arbitrary point in the past
1282    info_ptr->may_skip_backward = false;      // not subject to resetting or drifting
1283    info_ptr->may_skip_forward = false;       // not subject to resetting or drifting
1284  } else {
1285    // gettimeofday - based on time in seconds since the Epoch thus does not wrap
1286    info_ptr->max_value = ALL_64_BITS;
1287
1288    // gettimeofday is a real time clock so it skips
1289    info_ptr->may_skip_backward = true;
1290    info_ptr->may_skip_forward = true;
1291  }
1292
1293  info_ptr->kind = JVMTI_TIMER_ELAPSED;                // elapsed not CPU time
1294}
1295
1296// Return the real, user, and system times in seconds from an
1297// arbitrary fixed point in the past.
1298bool os::getTimesSecs(double* process_real_time,
1299                      double* process_user_time,
1300                      double* process_system_time) {
1301  struct tms ticks;
1302  clock_t real_ticks = times(&ticks);
1303
1304  if (real_ticks == (clock_t) (-1)) {
1305    return false;
1306  } else {
1307    double ticks_per_second = (double) clock_tics_per_sec;
1308    *process_user_time = ((double) ticks.tms_utime) / ticks_per_second;
1309    *process_system_time = ((double) ticks.tms_stime) / ticks_per_second;
1310    *process_real_time = ((double) real_ticks) / ticks_per_second;
1311
1312    return true;
1313  }
1314}
1315
1316
1317char * os::local_time_string(char *buf, size_t buflen) {
1318  struct tm t;
1319  time_t long_time;
1320  time(&long_time);
1321  localtime_r(&long_time, &t);
1322  jio_snprintf(buf, buflen, "%d-%02d-%02d %02d:%02d:%02d",
1323               t.tm_year + 1900, t.tm_mon + 1, t.tm_mday,
1324               t.tm_hour, t.tm_min, t.tm_sec);
1325  return buf;
1326}
1327
1328struct tm* os::localtime_pd(const time_t* clock, struct tm*  res) {
1329  return localtime_r(clock, res);
1330}
1331
1332////////////////////////////////////////////////////////////////////////////////
1333// runtime exit support
1334
1335// Note: os::shutdown() might be called very early during initialization, or
1336// called from signal handler. Before adding something to os::shutdown(), make
1337// sure it is async-safe and can handle partially initialized VM.
1338void os::shutdown() {
1339
1340  // allow PerfMemory to attempt cleanup of any persistent resources
1341  perfMemory_exit();
1342
1343  // needs to remove object in file system
1344  AttachListener::abort();
1345
1346  // flush buffered output, finish log files
1347  ostream_abort();
1348
1349  // Check for abort hook
1350  abort_hook_t abort_hook = Arguments::abort_hook();
1351  if (abort_hook != NULL) {
1352    abort_hook();
1353  }
1354
1355}
1356
1357// Note: os::abort() might be called very early during initialization, or
1358// called from signal handler. Before adding something to os::abort(), make
1359// sure it is async-safe and can handle partially initialized VM.
1360void os::abort(bool dump_core, void* siginfo, const void* context) {
1361  os::shutdown();
1362  if (dump_core) {
1363#ifndef PRODUCT
1364    fdStream out(defaultStream::output_fd());
1365    out.print_raw("Current thread is ");
1366    char buf[16];
1367    jio_snprintf(buf, sizeof(buf), UINTX_FORMAT, os::current_thread_id());
1368    out.print_raw_cr(buf);
1369    out.print_raw_cr("Dumping core ...");
1370#endif
1371    ::abort(); // dump core
1372  }
1373
1374  ::exit(1);
1375}
1376
1377// Die immediately, no exit hook, no abort hook, no cleanup.
1378void os::die() {
1379  ::abort();
1380}
1381
1382
1383// This method is a copy of JDK's sysGetLastErrorString
1384// from src/solaris/hpi/src/system_md.c
1385
1386size_t os::lasterror(char *buf, size_t len) {
1387  if (errno == 0)  return 0;
1388
1389  const char *s = os::strerror(errno);
1390  size_t n = ::strlen(s);
1391  if (n >= len) {
1392    n = len - 1;
1393  }
1394  ::strncpy(buf, s, n);
1395  buf[n] = '\0';
1396  return n;
1397}
1398
1399// thread_id is kernel thread id (similar to Solaris LWP id)
1400intx os::current_thread_id() { return os::Linux::gettid(); }
1401int os::current_process_id() {
1402  return ::getpid();
1403}
1404
1405// DLL functions
1406
1407const char* os::dll_file_extension() { return ".so"; }
1408
1409// This must be hard coded because it's the system's temporary
1410// directory not the java application's temp directory, ala java.io.tmpdir.
1411const char* os::get_temp_directory() { return "/tmp"; }
1412
1413static bool file_exists(const char* filename) {
1414  struct stat statbuf;
1415  if (filename == NULL || strlen(filename) == 0) {
1416    return false;
1417  }
1418  return os::stat(filename, &statbuf) == 0;
1419}
1420
1421bool os::dll_build_name(char* buffer, size_t buflen,
1422                        const char* pname, const char* fname) {
1423  bool retval = false;
1424  // Copied from libhpi
1425  const size_t pnamelen = pname ? strlen(pname) : 0;
1426
1427  // Return error on buffer overflow.
1428  if (pnamelen + strlen(fname) + 10 > (size_t) buflen) {
1429    return retval;
1430  }
1431
1432  if (pnamelen == 0) {
1433    snprintf(buffer, buflen, "lib%s.so", fname);
1434    retval = true;
1435  } else if (strchr(pname, *os::path_separator()) != NULL) {
1436    int n;
1437    char** pelements = split_path(pname, &n);
1438    if (pelements == NULL) {
1439      return false;
1440    }
1441    for (int i = 0; i < n; i++) {
1442      // Really shouldn't be NULL, but check can't hurt
1443      if (pelements[i] == NULL || strlen(pelements[i]) == 0) {
1444        continue; // skip the empty path values
1445      }
1446      snprintf(buffer, buflen, "%s/lib%s.so", pelements[i], fname);
1447      if (file_exists(buffer)) {
1448        retval = true;
1449        break;
1450      }
1451    }
1452    // release the storage
1453    for (int i = 0; i < n; i++) {
1454      if (pelements[i] != NULL) {
1455        FREE_C_HEAP_ARRAY(char, pelements[i]);
1456      }
1457    }
1458    if (pelements != NULL) {
1459      FREE_C_HEAP_ARRAY(char*, pelements);
1460    }
1461  } else {
1462    snprintf(buffer, buflen, "%s/lib%s.so", pname, fname);
1463    retval = true;
1464  }
1465  return retval;
1466}
1467
1468// check if addr is inside libjvm.so
1469bool os::address_is_in_vm(address addr) {
1470  static address libjvm_base_addr;
1471  Dl_info dlinfo;
1472
1473  if (libjvm_base_addr == NULL) {
1474    if (dladdr(CAST_FROM_FN_PTR(void *, os::address_is_in_vm), &dlinfo) != 0) {
1475      libjvm_base_addr = (address)dlinfo.dli_fbase;
1476    }
1477    assert(libjvm_base_addr !=NULL, "Cannot obtain base address for libjvm");
1478  }
1479
1480  if (dladdr((void *)addr, &dlinfo) != 0) {
1481    if (libjvm_base_addr == (address)dlinfo.dli_fbase) return true;
1482  }
1483
1484  return false;
1485}
1486
1487bool os::dll_address_to_function_name(address addr, char *buf,
1488                                      int buflen, int *offset,
1489                                      bool demangle) {
1490  // buf is not optional, but offset is optional
1491  assert(buf != NULL, "sanity check");
1492
1493  Dl_info dlinfo;
1494
1495  if (dladdr((void*)addr, &dlinfo) != 0) {
1496    // see if we have a matching symbol
1497    if (dlinfo.dli_saddr != NULL && dlinfo.dli_sname != NULL) {
1498      if (!(demangle && Decoder::demangle(dlinfo.dli_sname, buf, buflen))) {
1499        jio_snprintf(buf, buflen, "%s", dlinfo.dli_sname);
1500      }
1501      if (offset != NULL) *offset = addr - (address)dlinfo.dli_saddr;
1502      return true;
1503    }
1504    // no matching symbol so try for just file info
1505    if (dlinfo.dli_fname != NULL && dlinfo.dli_fbase != NULL) {
1506      if (Decoder::decode((address)(addr - (address)dlinfo.dli_fbase),
1507                          buf, buflen, offset, dlinfo.dli_fname, demangle)) {
1508        return true;
1509      }
1510    }
1511  }
1512
1513  buf[0] = '\0';
1514  if (offset != NULL) *offset = -1;
1515  return false;
1516}
1517
1518struct _address_to_library_name {
1519  address addr;          // input : memory address
1520  size_t  buflen;        //         size of fname
1521  char*   fname;         // output: library name
1522  address base;          //         library base addr
1523};
1524
1525static int address_to_library_name_callback(struct dl_phdr_info *info,
1526                                            size_t size, void *data) {
1527  int i;
1528  bool found = false;
1529  address libbase = NULL;
1530  struct _address_to_library_name * d = (struct _address_to_library_name *)data;
1531
1532  // iterate through all loadable segments
1533  for (i = 0; i < info->dlpi_phnum; i++) {
1534    address segbase = (address)(info->dlpi_addr + info->dlpi_phdr[i].p_vaddr);
1535    if (info->dlpi_phdr[i].p_type == PT_LOAD) {
1536      // base address of a library is the lowest address of its loaded
1537      // segments.
1538      if (libbase == NULL || libbase > segbase) {
1539        libbase = segbase;
1540      }
1541      // see if 'addr' is within current segment
1542      if (segbase <= d->addr &&
1543          d->addr < segbase + info->dlpi_phdr[i].p_memsz) {
1544        found = true;
1545      }
1546    }
1547  }
1548
1549  // dlpi_name is NULL or empty if the ELF file is executable, return 0
1550  // so dll_address_to_library_name() can fall through to use dladdr() which
1551  // can figure out executable name from argv[0].
1552  if (found && info->dlpi_name && info->dlpi_name[0]) {
1553    d->base = libbase;
1554    if (d->fname) {
1555      jio_snprintf(d->fname, d->buflen, "%s", info->dlpi_name);
1556    }
1557    return 1;
1558  }
1559  return 0;
1560}
1561
1562bool os::dll_address_to_library_name(address addr, char* buf,
1563                                     int buflen, int* offset) {
1564  // buf is not optional, but offset is optional
1565  assert(buf != NULL, "sanity check");
1566
1567  Dl_info dlinfo;
1568  struct _address_to_library_name data;
1569
1570  // There is a bug in old glibc dladdr() implementation that it could resolve
1571  // to wrong library name if the .so file has a base address != NULL. Here
1572  // we iterate through the program headers of all loaded libraries to find
1573  // out which library 'addr' really belongs to. This workaround can be
1574  // removed once the minimum requirement for glibc is moved to 2.3.x.
1575  data.addr = addr;
1576  data.fname = buf;
1577  data.buflen = buflen;
1578  data.base = NULL;
1579  int rslt = dl_iterate_phdr(address_to_library_name_callback, (void *)&data);
1580
1581  if (rslt) {
1582    // buf already contains library name
1583    if (offset) *offset = addr - data.base;
1584    return true;
1585  }
1586  if (dladdr((void*)addr, &dlinfo) != 0) {
1587    if (dlinfo.dli_fname != NULL) {
1588      jio_snprintf(buf, buflen, "%s", dlinfo.dli_fname);
1589    }
1590    if (dlinfo.dli_fbase != NULL && offset != NULL) {
1591      *offset = addr - (address)dlinfo.dli_fbase;
1592    }
1593    return true;
1594  }
1595
1596  buf[0] = '\0';
1597  if (offset) *offset = -1;
1598  return false;
1599}
1600
1601// Loads .dll/.so and
1602// in case of error it checks if .dll/.so was built for the
1603// same architecture as Hotspot is running on
1604
1605
1606// Remember the stack's state. The Linux dynamic linker will change
1607// the stack to 'executable' at most once, so we must safepoint only once.
1608bool os::Linux::_stack_is_executable = false;
1609
1610// VM operation that loads a library.  This is necessary if stack protection
1611// of the Java stacks can be lost during loading the library.  If we
1612// do not stop the Java threads, they can stack overflow before the stacks
1613// are protected again.
1614class VM_LinuxDllLoad: public VM_Operation {
1615 private:
1616  const char *_filename;
1617  char *_ebuf;
1618  int _ebuflen;
1619  void *_lib;
1620 public:
1621  VM_LinuxDllLoad(const char *fn, char *ebuf, int ebuflen) :
1622    _filename(fn), _ebuf(ebuf), _ebuflen(ebuflen), _lib(NULL) {}
1623  VMOp_Type type() const { return VMOp_LinuxDllLoad; }
1624  void doit() {
1625    _lib = os::Linux::dll_load_in_vmthread(_filename, _ebuf, _ebuflen);
1626    os::Linux::_stack_is_executable = true;
1627  }
1628  void* loaded_library() { return _lib; }
1629};
1630
1631void * os::dll_load(const char *filename, char *ebuf, int ebuflen) {
1632  void * result = NULL;
1633  bool load_attempted = false;
1634
1635  // Check whether the library to load might change execution rights
1636  // of the stack. If they are changed, the protection of the stack
1637  // guard pages will be lost. We need a safepoint to fix this.
1638  //
1639  // See Linux man page execstack(8) for more info.
1640  if (os::uses_stack_guard_pages() && !os::Linux::_stack_is_executable) {
1641    ElfFile ef(filename);
1642    if (!ef.specifies_noexecstack()) {
1643      if (!is_init_completed()) {
1644        os::Linux::_stack_is_executable = true;
1645        // This is OK - No Java threads have been created yet, and hence no
1646        // stack guard pages to fix.
1647        //
1648        // This should happen only when you are building JDK7 using a very
1649        // old version of JDK6 (e.g., with JPRT) and running test_gamma.
1650        //
1651        // Dynamic loader will make all stacks executable after
1652        // this function returns, and will not do that again.
1653        assert(Threads::first() == NULL, "no Java threads should exist yet.");
1654      } else {
1655        warning("You have loaded library %s which might have disabled stack guard. "
1656                "The VM will try to fix the stack guard now.\n"
1657                "It's highly recommended that you fix the library with "
1658                "'execstack -c <libfile>', or link it with '-z noexecstack'.",
1659                filename);
1660
1661        assert(Thread::current()->is_Java_thread(), "must be Java thread");
1662        JavaThread *jt = JavaThread::current();
1663        if (jt->thread_state() != _thread_in_native) {
1664          // This happens when a compiler thread tries to load a hsdis-<arch>.so file
1665          // that requires ExecStack. Cannot enter safe point. Let's give up.
1666          warning("Unable to fix stack guard. Giving up.");
1667        } else {
1668          if (!LoadExecStackDllInVMThread) {
1669            // This is for the case where the DLL has an static
1670            // constructor function that executes JNI code. We cannot
1671            // load such DLLs in the VMThread.
1672            result = os::Linux::dlopen_helper(filename, ebuf, ebuflen);
1673          }
1674
1675          ThreadInVMfromNative tiv(jt);
1676          debug_only(VMNativeEntryWrapper vew;)
1677
1678          VM_LinuxDllLoad op(filename, ebuf, ebuflen);
1679          VMThread::execute(&op);
1680          if (LoadExecStackDllInVMThread) {
1681            result = op.loaded_library();
1682          }
1683          load_attempted = true;
1684        }
1685      }
1686    }
1687  }
1688
1689  if (!load_attempted) {
1690    result = os::Linux::dlopen_helper(filename, ebuf, ebuflen);
1691  }
1692
1693  if (result != NULL) {
1694    // Successful loading
1695    return result;
1696  }
1697
1698  Elf32_Ehdr elf_head;
1699  int diag_msg_max_length=ebuflen-strlen(ebuf);
1700  char* diag_msg_buf=ebuf+strlen(ebuf);
1701
1702  if (diag_msg_max_length==0) {
1703    // No more space in ebuf for additional diagnostics message
1704    return NULL;
1705  }
1706
1707
1708  int file_descriptor= ::open(filename, O_RDONLY | O_NONBLOCK);
1709
1710  if (file_descriptor < 0) {
1711    // Can't open library, report dlerror() message
1712    return NULL;
1713  }
1714
1715  bool failed_to_read_elf_head=
1716    (sizeof(elf_head)!=
1717     (::read(file_descriptor, &elf_head,sizeof(elf_head))));
1718
1719  ::close(file_descriptor);
1720  if (failed_to_read_elf_head) {
1721    // file i/o error - report dlerror() msg
1722    return NULL;
1723  }
1724
1725  typedef struct {
1726    Elf32_Half    code;         // Actual value as defined in elf.h
1727    Elf32_Half    compat_class; // Compatibility of archs at VM's sense
1728    unsigned char elf_class;    // 32 or 64 bit
1729    unsigned char endianess;    // MSB or LSB
1730    char*         name;         // String representation
1731  } arch_t;
1732
1733#ifndef EM_486
1734  #define EM_486          6               /* Intel 80486 */
1735#endif
1736#ifndef EM_AARCH64
1737  #define EM_AARCH64    183               /* ARM AARCH64 */
1738#endif
1739
1740  static const arch_t arch_array[]={
1741    {EM_386,         EM_386,     ELFCLASS32, ELFDATA2LSB, (char*)"IA 32"},
1742    {EM_486,         EM_386,     ELFCLASS32, ELFDATA2LSB, (char*)"IA 32"},
1743    {EM_IA_64,       EM_IA_64,   ELFCLASS64, ELFDATA2LSB, (char*)"IA 64"},
1744    {EM_X86_64,      EM_X86_64,  ELFCLASS64, ELFDATA2LSB, (char*)"AMD 64"},
1745    {EM_SPARC,       EM_SPARC,   ELFCLASS32, ELFDATA2MSB, (char*)"Sparc 32"},
1746    {EM_SPARC32PLUS, EM_SPARC,   ELFCLASS32, ELFDATA2MSB, (char*)"Sparc 32"},
1747    {EM_SPARCV9,     EM_SPARCV9, ELFCLASS64, ELFDATA2MSB, (char*)"Sparc v9 64"},
1748    {EM_PPC,         EM_PPC,     ELFCLASS32, ELFDATA2MSB, (char*)"Power PC 32"},
1749#if defined(VM_LITTLE_ENDIAN)
1750    {EM_PPC64,       EM_PPC64,   ELFCLASS64, ELFDATA2LSB, (char*)"Power PC 64"},
1751#else
1752    {EM_PPC64,       EM_PPC64,   ELFCLASS64, ELFDATA2MSB, (char*)"Power PC 64 LE"},
1753#endif
1754    {EM_ARM,         EM_ARM,     ELFCLASS32,   ELFDATA2LSB, (char*)"ARM"},
1755    {EM_S390,        EM_S390,    ELFCLASSNONE, ELFDATA2MSB, (char*)"IBM System/390"},
1756    {EM_ALPHA,       EM_ALPHA,   ELFCLASS64, ELFDATA2LSB, (char*)"Alpha"},
1757    {EM_MIPS_RS3_LE, EM_MIPS_RS3_LE, ELFCLASS32, ELFDATA2LSB, (char*)"MIPSel"},
1758    {EM_MIPS,        EM_MIPS,    ELFCLASS32, ELFDATA2MSB, (char*)"MIPS"},
1759    {EM_PARISC,      EM_PARISC,  ELFCLASS32, ELFDATA2MSB, (char*)"PARISC"},
1760    {EM_68K,         EM_68K,     ELFCLASS32, ELFDATA2MSB, (char*)"M68k"},
1761    {EM_AARCH64,     EM_AARCH64, ELFCLASS64, ELFDATA2LSB, (char*)"AARCH64"},
1762  };
1763
1764#if  (defined IA32)
1765  static  Elf32_Half running_arch_code=EM_386;
1766#elif   (defined AMD64)
1767  static  Elf32_Half running_arch_code=EM_X86_64;
1768#elif  (defined IA64)
1769  static  Elf32_Half running_arch_code=EM_IA_64;
1770#elif  (defined __sparc) && (defined _LP64)
1771  static  Elf32_Half running_arch_code=EM_SPARCV9;
1772#elif  (defined __sparc) && (!defined _LP64)
1773  static  Elf32_Half running_arch_code=EM_SPARC;
1774#elif  (defined __powerpc64__)
1775  static  Elf32_Half running_arch_code=EM_PPC64;
1776#elif  (defined __powerpc__)
1777  static  Elf32_Half running_arch_code=EM_PPC;
1778#elif  (defined AARCH64)
1779  static  Elf32_Half running_arch_code=EM_AARCH64;
1780#elif  (defined ARM)
1781  static  Elf32_Half running_arch_code=EM_ARM;
1782#elif  (defined S390)
1783  static  Elf32_Half running_arch_code=EM_S390;
1784#elif  (defined ALPHA)
1785  static  Elf32_Half running_arch_code=EM_ALPHA;
1786#elif  (defined MIPSEL)
1787  static  Elf32_Half running_arch_code=EM_MIPS_RS3_LE;
1788#elif  (defined PARISC)
1789  static  Elf32_Half running_arch_code=EM_PARISC;
1790#elif  (defined MIPS)
1791  static  Elf32_Half running_arch_code=EM_MIPS;
1792#elif  (defined M68K)
1793  static  Elf32_Half running_arch_code=EM_68K;
1794#else
1795    #error Method os::dll_load requires that one of following is defined:\
1796        AARCH64, ALPHA, ARM, AMD64, IA32, IA64, M68K, MIPS, MIPSEL, PARISC, __powerpc__, __powerpc64__, S390, __sparc
1797#endif
1798
1799  // Identify compatability class for VM's architecture and library's architecture
1800  // Obtain string descriptions for architectures
1801
1802  arch_t lib_arch={elf_head.e_machine,0,elf_head.e_ident[EI_CLASS], elf_head.e_ident[EI_DATA], NULL};
1803  int running_arch_index=-1;
1804
1805  for (unsigned int i=0; i < ARRAY_SIZE(arch_array); i++) {
1806    if (running_arch_code == arch_array[i].code) {
1807      running_arch_index    = i;
1808    }
1809    if (lib_arch.code == arch_array[i].code) {
1810      lib_arch.compat_class = arch_array[i].compat_class;
1811      lib_arch.name         = arch_array[i].name;
1812    }
1813  }
1814
1815  assert(running_arch_index != -1,
1816         "Didn't find running architecture code (running_arch_code) in arch_array");
1817  if (running_arch_index == -1) {
1818    // Even though running architecture detection failed
1819    // we may still continue with reporting dlerror() message
1820    return NULL;
1821  }
1822
1823  if (lib_arch.endianess != arch_array[running_arch_index].endianess) {
1824    ::snprintf(diag_msg_buf, diag_msg_max_length-1," (Possible cause: endianness mismatch)");
1825    return NULL;
1826  }
1827
1828#ifndef S390
1829  if (lib_arch.elf_class != arch_array[running_arch_index].elf_class) {
1830    ::snprintf(diag_msg_buf, diag_msg_max_length-1," (Possible cause: architecture word width mismatch)");
1831    return NULL;
1832  }
1833#endif // !S390
1834
1835  if (lib_arch.compat_class != arch_array[running_arch_index].compat_class) {
1836    if (lib_arch.name!=NULL) {
1837      ::snprintf(diag_msg_buf, diag_msg_max_length-1,
1838                 " (Possible cause: can't load %s-bit .so on a %s-bit platform)",
1839                 lib_arch.name, arch_array[running_arch_index].name);
1840    } else {
1841      ::snprintf(diag_msg_buf, diag_msg_max_length-1,
1842                 " (Possible cause: can't load this .so (machine code=0x%x) on a %s-bit platform)",
1843                 lib_arch.code,
1844                 arch_array[running_arch_index].name);
1845    }
1846  }
1847
1848  return NULL;
1849}
1850
1851void * os::Linux::dlopen_helper(const char *filename, char *ebuf,
1852                                int ebuflen) {
1853  void * result = ::dlopen(filename, RTLD_LAZY);
1854  if (result == NULL) {
1855    ::strncpy(ebuf, ::dlerror(), ebuflen - 1);
1856    ebuf[ebuflen-1] = '\0';
1857  }
1858  return result;
1859}
1860
1861void * os::Linux::dll_load_in_vmthread(const char *filename, char *ebuf,
1862                                       int ebuflen) {
1863  void * result = NULL;
1864  if (LoadExecStackDllInVMThread) {
1865    result = dlopen_helper(filename, ebuf, ebuflen);
1866  }
1867
1868  // Since 7019808, libjvm.so is linked with -noexecstack. If the VM loads a
1869  // library that requires an executable stack, or which does not have this
1870  // stack attribute set, dlopen changes the stack attribute to executable. The
1871  // read protection of the guard pages gets lost.
1872  //
1873  // Need to check _stack_is_executable again as multiple VM_LinuxDllLoad
1874  // may have been queued at the same time.
1875
1876  if (!_stack_is_executable) {
1877    JavaThread *jt = Threads::first();
1878
1879    while (jt) {
1880      if (!jt->stack_guard_zone_unused() &&     // Stack not yet fully initialized
1881          jt->stack_guards_enabled()) {         // No pending stack overflow exceptions
1882        if (!os::guard_memory((char *)jt->stack_end(), jt->stack_guard_zone_size())) {
1883          warning("Attempt to reguard stack yellow zone failed.");
1884        }
1885      }
1886      jt = jt->next();
1887    }
1888  }
1889
1890  return result;
1891}
1892
1893void* os::dll_lookup(void* handle, const char* name) {
1894  void* res = dlsym(handle, name);
1895  return res;
1896}
1897
1898void* os::get_default_process_handle() {
1899  return (void*)::dlopen(NULL, RTLD_LAZY);
1900}
1901
1902static bool _print_ascii_file(const char* filename, outputStream* st) {
1903  int fd = ::open(filename, O_RDONLY);
1904  if (fd == -1) {
1905    return false;
1906  }
1907
1908  char buf[33];
1909  int bytes;
1910  buf[32] = '\0';
1911  while ((bytes = ::read(fd, buf, sizeof(buf)-1)) > 0) {
1912    st->print_raw(buf, bytes);
1913  }
1914
1915  ::close(fd);
1916
1917  return true;
1918}
1919
1920void os::print_dll_info(outputStream *st) {
1921  st->print_cr("Dynamic libraries:");
1922
1923  char fname[32];
1924  pid_t pid = os::Linux::gettid();
1925
1926  jio_snprintf(fname, sizeof(fname), "/proc/%d/maps", pid);
1927
1928  if (!_print_ascii_file(fname, st)) {
1929    st->print("Can not get library information for pid = %d\n", pid);
1930  }
1931}
1932
1933int os::get_loaded_modules_info(os::LoadedModulesCallbackFunc callback, void *param) {
1934  FILE *procmapsFile = NULL;
1935
1936  // Open the procfs maps file for the current process
1937  if ((procmapsFile = fopen("/proc/self/maps", "r")) != NULL) {
1938    // Allocate PATH_MAX for file name plus a reasonable size for other fields.
1939    char line[PATH_MAX + 100];
1940
1941    // Read line by line from 'file'
1942    while (fgets(line, sizeof(line), procmapsFile) != NULL) {
1943      u8 base, top, offset, inode;
1944      char permissions[5];
1945      char device[6];
1946      char name[PATH_MAX + 1];
1947
1948      // Parse fields from line
1949      sscanf(line, UINT64_FORMAT_X "-" UINT64_FORMAT_X " %4s " UINT64_FORMAT_X " %5s " INT64_FORMAT " %s",
1950             &base, &top, permissions, &offset, device, &inode, name);
1951
1952      // Filter by device id '00:00' so that we only get file system mapped files.
1953      if (strcmp(device, "00:00") != 0) {
1954
1955        // Call callback with the fields of interest
1956        if(callback(name, (address)base, (address)top, param)) {
1957          // Oops abort, callback aborted
1958          fclose(procmapsFile);
1959          return 1;
1960        }
1961      }
1962    }
1963    fclose(procmapsFile);
1964  }
1965  return 0;
1966}
1967
1968void os::print_os_info_brief(outputStream* st) {
1969  os::Linux::print_distro_info(st);
1970
1971  os::Posix::print_uname_info(st);
1972
1973  os::Linux::print_libversion_info(st);
1974
1975}
1976
1977void os::print_os_info(outputStream* st) {
1978  st->print("OS:");
1979
1980  os::Linux::print_distro_info(st);
1981
1982  os::Posix::print_uname_info(st);
1983
1984  // Print warning if unsafe chroot environment detected
1985  if (unsafe_chroot_detected) {
1986    st->print("WARNING!! ");
1987    st->print_cr("%s", unstable_chroot_error);
1988  }
1989
1990  os::Linux::print_libversion_info(st);
1991
1992  os::Posix::print_rlimit_info(st);
1993
1994  os::Posix::print_load_average(st);
1995
1996  os::Linux::print_full_memory_info(st);
1997}
1998
// Files consulted, in this order, to identify the Linux distribution.
//
// Most distributions ship an /etc/XXX-release file holding the OS version
// string; newer ones also ship /etc/lsb-release. Some ship several such
// files (e.g. Mandrake has both /etc/mandrake-release and
// /etc/redhat-release), which is why the order of this table matters:
//
//  - Distributions derived from Red Hat (Oracle, Mandrake, Sun JDS, ...)
//    carry their own XXX-release file in addition to redhat-release, so the
//    specific file must be tried before redhat-release.
//  - Red Hat and SuSE ship an lsb-release file that is not very descriptive,
//    so redhat-release / SuSE-release are tried before lsb-release.
//  - lsb-release is the newer standard, so it is tried before the remaining
//    older-style release files.
//  - system-release (Red Hat) and os-release (other Linuxes) are the
//    next-to-last resort. os-release is a new standard containing
//    distribution information; system-release seems to be an old standard
//    superseded by lsb-release and os-release.
//
// /etc/debian_version is the last resort and is handled separately by the
// users of this table: it only holds a string such as "6.0.6" or
// "wheezy/sid", so "Debian " is printed in front of its contents.

const char* distro_files[] = {
  "/etc/oracle-release",
  "/etc/mandriva-release",
  "/etc/mandrake-release",
  "/etc/sun-release",
  "/etc/redhat-release",
  "/etc/SuSE-release",
  "/etc/lsb-release",
  "/etc/turbolinux-release",
  "/etc/gentoo-release",
  "/etc/ltib-release",
  "/etc/angstrom-version",
  "/etc/system-release",
  "/etc/os-release",
  NULL  // end-of-table marker
};
2036
2037void os::Linux::print_distro_info(outputStream* st) {
2038  for (int i = 0;; i++) {
2039    const char* file = distro_files[i];
2040    if (file == NULL) {
2041      break;  // done
2042    }
2043    // If file prints, we found it.
2044    if (_print_ascii_file(file, st)) {
2045      return;
2046    }
2047  }
2048
2049  if (file_exists("/etc/debian_version")) {
2050    st->print("Debian ");
2051    _print_ascii_file("/etc/debian_version", st);
2052  } else {
2053    st->print("Linux");
2054  }
2055  st->cr();
2056}
2057
// Cuts 'text' at the first occurrence of 'stop' (if any) and copies the
// result into 'distro' (capacity 'length'), truncating if necessary and
// always NUL-terminating when length > 0.
static void copy_os_info_field(char* distro, size_t length, char* text, char stop) {
  char* cut = strchr(text, stop);
  if (cut != NULL) *cut = '\0';
  if (length == 0) {
    return;
  }
  strncpy(distro, text, length);
  distro[length - 1] = '\0';  // strncpy does not terminate on truncation
}

// Extracts a distribution description from the release file open on 'fp'.
//
// Parameters:
//   fp             - stream positioned at the start of the release file
//   distro         - output buffer
//   length         - capacity of 'distro' in bytes
//   get_first_line - if true, the first line that is not a
//                    DISTRIB_DESCRIPTION=/PRETTY_NAME= line is the answer
//
// Selection rules, applied line by line:
//   - a line containing DISTRIB_DESCRIPTION= or PRETTY_NAME= yields the text
//     between the first pair of double quotes, or, if the line has no quote,
//     the text after the first '=' up to the newline;
//   - otherwise, with get_first_line set, the line itself is used;
//   - if no line was selected, the last line read is used (an empty string
//     for an empty file).
static void parse_os_info_helper(FILE* fp, char* distro, size_t length, bool get_first_line) {
  char buf[256];
  buf[0] = '\0';  // defined content even if the file has no lines at all
  while (fgets(buf, sizeof(buf), fp)) {
    // Edit out extra stuff in expected format
    if (strstr(buf, "DISTRIB_DESCRIPTION=") != NULL || strstr(buf, "PRETTY_NAME=") != NULL) {
      char* ptr = strstr(buf, "\"");  // the name is in quotes
      if (ptr != NULL) {
        // go beyond first quote, stop at the closing one
        copy_os_info_field(distro, length, ptr + 1, '\"');
      } else {
        // no quotes: take everything after the equals sign
        ptr = strstr(buf, "=");
        copy_os_info_field(distro, length, ptr + 1, '\n');
      }
      return;
    } else if (get_first_line) {
      copy_os_info_field(distro, length, buf, '\n');
      return;
    }
  }
  // No line matched: fall back to the last line that was read.
  copy_os_info_field(distro, length, buf, '\n');
}
2089
2090static void parse_os_info(char* distro, size_t length, const char* file) {
2091  FILE* fp = fopen(file, "r");
2092  if (fp != NULL) {
2093    // if suse format, print out first line
2094    bool get_first_line = (strcmp(file, "/etc/SuSE-release") == 0);
2095    parse_os_info_helper(fp, distro, length, get_first_line);
2096    fclose(fp);
2097  }
2098}
2099
2100void os::get_summary_os_info(char* buf, size_t buflen) {
2101  for (int i = 0;; i++) {
2102    const char* file = distro_files[i];
2103    if (file == NULL) {
2104      break; // ran out of distro_files
2105    }
2106    if (file_exists(file)) {
2107      parse_os_info(buf, buflen, file);
2108      return;
2109    }
2110  }
2111  // special case for debian
2112  if (file_exists("/etc/debian_version")) {
2113    strncpy(buf, "Debian ", buflen);
2114    parse_os_info(&buf[7], buflen-7, "/etc/debian_version");
2115  } else {
2116    strncpy(buf, "Linux", buflen);
2117  }
2118}
2119
2120void os::Linux::print_libversion_info(outputStream* st) {
2121  // libc, pthread
2122  st->print("libc:");
2123  st->print("%s ", os::Linux::glibc_version());
2124  st->print("%s ", os::Linux::libpthread_version());
2125  st->cr();
2126}
2127
2128void os::Linux::print_full_memory_info(outputStream* st) {
2129  st->print("\n/proc/meminfo:\n");
2130  _print_ascii_file("/proc/meminfo", st);
2131  st->cr();
2132}
2133
2134void os::print_memory_info(outputStream* st) {
2135
2136  st->print("Memory:");
2137  st->print(" %dk page", os::vm_page_size()>>10);
2138
2139  // values in struct sysinfo are "unsigned long"
2140  struct sysinfo si;
2141  sysinfo(&si);
2142
2143  st->print(", physical " UINT64_FORMAT "k",
2144            os::physical_memory() >> 10);
2145  st->print("(" UINT64_FORMAT "k free)",
2146            os::available_memory() >> 10);
2147  st->print(", swap " UINT64_FORMAT "k",
2148            ((jlong)si.totalswap * si.mem_unit) >> 10);
2149  st->print("(" UINT64_FORMAT "k free)",
2150            ((jlong)si.freeswap * si.mem_unit) >> 10);
2151  st->cr();
2152}
2153
2154// Print the first "model name" line and the first "flags" line
2155// that we find and nothing more. We assume "model name" comes
2156// before "flags" so if we find a second "model name", then the
2157// "flags" field is considered missing.
2158static bool print_model_name_and_flags(outputStream* st, char* buf, size_t buflen) {
2159#if defined(IA32) || defined(AMD64)
2160  // Other platforms have less repetitive cpuinfo files
2161  FILE *fp = fopen("/proc/cpuinfo", "r");
2162  if (fp) {
2163    while (!feof(fp)) {
2164      if (fgets(buf, buflen, fp)) {
2165        // Assume model name comes before flags
2166        bool model_name_printed = false;
2167        if (strstr(buf, "model name") != NULL) {
2168          if (!model_name_printed) {
2169            st->print_raw("CPU Model and flags from /proc/cpuinfo:\n");
2170            st->print_raw(buf);
2171            model_name_printed = true;
2172          } else {
2173            // model name printed but not flags?  Odd, just return
2174            fclose(fp);
2175            return true;
2176          }
2177        }
2178        // print the flags line too
2179        if (strstr(buf, "flags") != NULL) {
2180          st->print_raw(buf);
2181          fclose(fp);
2182          return true;
2183        }
2184      }
2185    }
2186    fclose(fp);
2187  }
2188#endif // x86 platforms
2189  return false;
2190}
2191
2192void os::pd_print_cpu_info(outputStream* st, char* buf, size_t buflen) {
2193  // Only print the model name if the platform provides this as a summary
2194  if (!print_model_name_and_flags(st, buf, buflen)) {
2195    st->print("\n/proc/cpuinfo:\n");
2196    if (!_print_ascii_file("/proc/cpuinfo", st)) {
2197      st->print_cr("  <Not Available>");
2198    }
2199  }
2200}
2201
// Key looked for in /proc/cpuinfo lines to find the CPU model description;
// the key differs between architectures.
const char* search_string =
#if defined(AMD64) || defined(IA32) || defined(X32)
  "model name";
#elif defined(PPC64)
  "cpu";
#elif defined(S390)
  "processor";
#elif defined(SPARC)
  "cpu";
#else
  "Processor";
#endif
2213
2214// Parses the cpuinfo file for string representing the model name.
2215void os::get_summary_cpu_info(char* cpuinfo, size_t length) {
2216  FILE* fp = fopen("/proc/cpuinfo", "r");
2217  if (fp != NULL) {
2218    while (!feof(fp)) {
2219      char buf[256];
2220      if (fgets(buf, sizeof(buf), fp)) {
2221        char* start = strstr(buf, search_string);
2222        if (start != NULL) {
2223          char *ptr = start + strlen(search_string);
2224          char *end = buf + strlen(buf);
2225          while (ptr != end) {
2226             // skip whitespace and colon for the rest of the name.
2227             if (*ptr != ' ' && *ptr != '\t' && *ptr != ':') {
2228               break;
2229             }
2230             ptr++;
2231          }
2232          if (ptr != end) {
2233            // reasonable string, get rid of newline and keep the rest
2234            char* nl = strchr(buf, '\n');
2235            if (nl != NULL) *nl = '\0';
2236            strncpy(cpuinfo, ptr, length);
2237            fclose(fp);
2238            return;
2239          }
2240        }
2241      }
2242    }
2243    fclose(fp);
2244  }
2245  // cpuinfo not found or parsing failed, just print generic string.  The entire
2246  // /proc/cpuinfo file will be printed later in the file (or enough of it for x86)
2247#if   defined(AARCH64)
2248  strncpy(cpuinfo, "AArch64", length);
2249#elif defined(AMD64)
2250  strncpy(cpuinfo, "x86_64", length);
2251#elif defined(ARM)  // Order wrt. AARCH64 is relevant!
2252  strncpy(cpuinfo, "ARM", length);
2253#elif defined(IA32)
2254  strncpy(cpuinfo, "x86_32", length);
2255#elif defined(IA64)
2256  strncpy(cpuinfo, "IA64", length);
2257#elif defined(PPC)
2258  strncpy(cpuinfo, "PPC64", length);
2259#elif defined(S390)
2260  strncpy(cpuinfo, "S390", length);
2261#elif defined(SPARC)
2262  strncpy(cpuinfo, "sparcv9", length);
2263#elif defined(ZERO_LIBARCH)
2264  strncpy(cpuinfo, ZERO_LIBARCH, length);
2265#else
2266  strncpy(cpuinfo, "unknown", length);
2267#endif
2268}
2269
2270static void print_signal_handler(outputStream* st, int sig,
2271                                 char* buf, size_t buflen);
2272
2273void os::print_signal_handlers(outputStream* st, char* buf, size_t buflen) {
2274  st->print_cr("Signal Handlers:");
2275  print_signal_handler(st, SIGSEGV, buf, buflen);
2276  print_signal_handler(st, SIGBUS , buf, buflen);
2277  print_signal_handler(st, SIGFPE , buf, buflen);
2278  print_signal_handler(st, SIGPIPE, buf, buflen);
2279  print_signal_handler(st, SIGXFSZ, buf, buflen);
2280  print_signal_handler(st, SIGILL , buf, buflen);
2281  print_signal_handler(st, SR_signum, buf, buflen);
2282  print_signal_handler(st, SHUTDOWN1_SIGNAL, buf, buflen);
2283  print_signal_handler(st, SHUTDOWN2_SIGNAL , buf, buflen);
2284  print_signal_handler(st, SHUTDOWN3_SIGNAL , buf, buflen);
2285  print_signal_handler(st, BREAK_SIGNAL, buf, buflen);
2286#if defined(PPC64)
2287  print_signal_handler(st, SIGTRAP, buf, buflen);
2288#endif
2289}
2290
2291static char saved_jvm_path[MAXPATHLEN] = {0};
2292
2293// Find the full path to the current module, libjvm.so
2294void os::jvm_path(char *buf, jint buflen) {
2295  // Error checking.
2296  if (buflen < MAXPATHLEN) {
2297    assert(false, "must use a large-enough buffer");
2298    buf[0] = '\0';
2299    return;
2300  }
2301  // Lazy resolve the path to current module.
2302  if (saved_jvm_path[0] != 0) {
2303    strcpy(buf, saved_jvm_path);
2304    return;
2305  }
2306
2307  char dli_fname[MAXPATHLEN];
2308  bool ret = dll_address_to_library_name(
2309                                         CAST_FROM_FN_PTR(address, os::jvm_path),
2310                                         dli_fname, sizeof(dli_fname), NULL);
2311  assert(ret, "cannot locate libjvm");
2312  char *rp = NULL;
2313  if (ret && dli_fname[0] != '\0') {
2314    rp = os::Posix::realpath(dli_fname, buf, buflen);
2315  }
2316  if (rp == NULL) {
2317    return;
2318  }
2319
2320  if (Arguments::sun_java_launcher_is_altjvm()) {
2321    // Support for the java launcher's '-XXaltjvm=<path>' option. Typical
2322    // value for buf is "<JAVA_HOME>/jre/lib/<vmtype>/libjvm.so".
2323    // If "/jre/lib/" appears at the right place in the string, then
2324    // assume we are installed in a JDK and we're done. Otherwise, check
2325    // for a JAVA_HOME environment variable and fix up the path so it
2326    // looks like libjvm.so is installed there (append a fake suffix
2327    // hotspot/libjvm.so).
2328    const char *p = buf + strlen(buf) - 1;
2329    for (int count = 0; p > buf && count < 5; ++count) {
2330      for (--p; p > buf && *p != '/'; --p)
2331        /* empty */ ;
2332    }
2333
2334    if (strncmp(p, "/jre/lib/", 9) != 0) {
2335      // Look for JAVA_HOME in the environment.
2336      char* java_home_var = ::getenv("JAVA_HOME");
2337      if (java_home_var != NULL && java_home_var[0] != 0) {
2338        char* jrelib_p;
2339        int len;
2340
2341        // Check the current module name "libjvm.so".
2342        p = strrchr(buf, '/');
2343        if (p == NULL) {
2344          return;
2345        }
2346        assert(strstr(p, "/libjvm") == p, "invalid library name");
2347
2348        rp = os::Posix::realpath(java_home_var, buf, buflen);
2349        if (rp == NULL) {
2350          return;
2351        }
2352
2353        // determine if this is a legacy image or modules image
2354        // modules image doesn't have "jre" subdirectory
2355        len = strlen(buf);
2356        assert(len < buflen, "Ran out of buffer room");
2357        jrelib_p = buf + len;
2358        snprintf(jrelib_p, buflen-len, "/jre/lib");
2359        if (0 != access(buf, F_OK)) {
2360          snprintf(jrelib_p, buflen-len, "/lib");
2361        }
2362
2363        if (0 == access(buf, F_OK)) {
2364          // Use current module name "libjvm.so"
2365          len = strlen(buf);
2366          snprintf(buf + len, buflen-len, "/hotspot/libjvm.so");
2367        } else {
2368          // Go back to path of .so
2369          rp = os::Posix::realpath(dli_fname, buf, buflen);
2370          if (rp == NULL) {
2371            return;
2372          }
2373        }
2374      }
2375    }
2376  }
2377
2378  strncpy(saved_jvm_path, buf, MAXPATHLEN);
2379  saved_jvm_path[MAXPATHLEN - 1] = '\0';
2380}
2381
2382void os::print_jni_name_prefix_on(outputStream* st, int args_size) {
2383  // no prefix required, not even "_"
2384}
2385
2386void os::print_jni_name_suffix_on(outputStream* st, int args_size) {
2387  // no suffix required
2388}
2389
2390////////////////////////////////////////////////////////////////////////////////
2391// sun.misc.Signal support
2392
2393static volatile jint sigint_count = 0;
2394
2395static void UserHandler(int sig, void *siginfo, void *context) {
2396  // 4511530 - sem_post is serialized and handled by the manager thread. When
2397  // the program is interrupted by Ctrl-C, SIGINT is sent to every thread. We
2398  // don't want to flood the manager thread with sem_post requests.
2399  if (sig == SIGINT && Atomic::add(1, &sigint_count) > 1) {
2400    return;
2401  }
2402
2403  // Ctrl-C is pressed during error reporting, likely because the error
2404  // handler fails to abort. Let VM die immediately.
2405  if (sig == SIGINT && is_error_reported()) {
2406    os::die();
2407  }
2408
2409  os::signal_notify(sig);
2410}
2411
2412void* os::user_handler() {
2413  return CAST_FROM_FN_PTR(void*, UserHandler);
2414}
2415
2416struct timespec PosixSemaphore::create_timespec(unsigned int sec, int nsec) {
2417  struct timespec ts;
2418  // Semaphore's are always associated with CLOCK_REALTIME
2419  os::Linux::clock_gettime(CLOCK_REALTIME, &ts);
2420  // see unpackTime for discussion on overflow checking
2421  if (sec >= MAX_SECS) {
2422    ts.tv_sec += MAX_SECS;
2423    ts.tv_nsec = 0;
2424  } else {
2425    ts.tv_sec += sec;
2426    ts.tv_nsec += nsec;
2427    if (ts.tv_nsec >= NANOSECS_PER_SEC) {
2428      ts.tv_nsec -= NANOSECS_PER_SEC;
2429      ++ts.tv_sec; // note: this must be <= max_secs
2430    }
2431  }
2432
2433  return ts;
2434}
2435
2436extern "C" {
2437  typedef void (*sa_handler_t)(int);
2438  typedef void (*sa_sigaction_t)(int, siginfo_t *, void *);
2439}
2440
2441void* os::signal(int signal_number, void* handler) {
2442  struct sigaction sigAct, oldSigAct;
2443
2444  sigfillset(&(sigAct.sa_mask));
2445  sigAct.sa_flags   = SA_RESTART|SA_SIGINFO;
2446  sigAct.sa_handler = CAST_TO_FN_PTR(sa_handler_t, handler);
2447
2448  if (sigaction(signal_number, &sigAct, &oldSigAct)) {
2449    // -1 means registration failed
2450    return (void *)-1;
2451  }
2452
2453  return CAST_FROM_FN_PTR(void*, oldSigAct.sa_handler);
2454}
2455
2456void os::signal_raise(int signal_number) {
2457  ::raise(signal_number);
2458}
2459
2460// The following code is moved from os.cpp for making this
2461// code platform specific, which it is by its very nature.
2462
2463// Will be modified when max signal is changed to be dynamic
2464int os::sigexitnum_pd() {
2465  return NSIG;
2466}
2467
2468// a counter for each possible signal value
2469static volatile jint pending_signals[NSIG+1] = { 0 };
2470
2471// Linux(POSIX) specific hand shaking semaphore.
2472static sem_t sig_sem;
2473static PosixSemaphore sr_semaphore;
2474
2475void os::signal_init_pd() {
2476  // Initialize signal structures
2477  ::memset((void*)pending_signals, 0, sizeof(pending_signals));
2478
2479  // Initialize signal semaphore
2480  ::sem_init(&sig_sem, 0, 0);
2481}
2482
2483void os::signal_notify(int sig) {
2484  Atomic::inc(&pending_signals[sig]);
2485  ::sem_post(&sig_sem);
2486}
2487
2488static int check_pending_signals(bool wait) {
2489  Atomic::store(0, &sigint_count);
2490  for (;;) {
2491    for (int i = 0; i < NSIG + 1; i++) {
2492      jint n = pending_signals[i];
2493      if (n > 0 && n == Atomic::cmpxchg(n - 1, &pending_signals[i], n)) {
2494        return i;
2495      }
2496    }
2497    if (!wait) {
2498      return -1;
2499    }
2500    JavaThread *thread = JavaThread::current();
2501    ThreadBlockInVM tbivm(thread);
2502
2503    bool threadIsSuspended;
2504    do {
2505      thread->set_suspend_equivalent();
2506      // cleared by handle_special_suspend_equivalent_condition() or java_suspend_self()
2507      ::sem_wait(&sig_sem);
2508
2509      // were we externally suspended while we were waiting?
2510      threadIsSuspended = thread->handle_special_suspend_equivalent_condition();
2511      if (threadIsSuspended) {
2512        // The semaphore has been incremented, but while we were waiting
2513        // another thread suspended us. We don't want to continue running
2514        // while suspended because that would surprise the thread that
2515        // suspended us.
2516        ::sem_post(&sig_sem);
2517
2518        thread->java_suspend_self();
2519      }
2520    } while (threadIsSuspended);
2521  }
2522}
2523
2524int os::signal_lookup() {
2525  return check_pending_signals(false);
2526}
2527
2528int os::signal_wait() {
2529  return check_pending_signals(true);
2530}
2531
2532////////////////////////////////////////////////////////////////////////////////
2533// Virtual Memory
2534
2535int os::vm_page_size() {
2536  // Seems redundant as all get out
2537  assert(os::Linux::page_size() != -1, "must call os::init");
2538  return os::Linux::page_size();
2539}
2540
2541// Solaris allocates memory by pages.
2542int os::vm_allocation_granularity() {
2543  assert(os::Linux::page_size() != -1, "must call os::init");
2544  return os::Linux::page_size();
2545}
2546
2547// Rationale behind this function:
2548//  current (Mon Apr 25 20:12:18 MSD 2005) oprofile drops samples without executable
2549//  mapping for address (see lookup_dcookie() in the kernel module), thus we cannot get
2550//  samples for JITted code. Here we create private executable mapping over the code cache
2551//  and then we can use standard (well, almost, as mapping can change) way to provide
2552//  info for the reporting script by storing timestamp and location of symbol
2553void linux_wrap_code(char* base, size_t size) {
2554  static volatile jint cnt = 0;
2555
2556  if (!UseOprofile) {
2557    return;
2558  }
2559
2560  char buf[PATH_MAX+1];
2561  int num = Atomic::add(1, &cnt);
2562
2563  snprintf(buf, sizeof(buf), "%s/hs-vm-%d-%d",
2564           os::get_temp_directory(), os::current_process_id(), num);
2565  unlink(buf);
2566
2567  int fd = ::open(buf, O_CREAT | O_RDWR, S_IRWXU);
2568
2569  if (fd != -1) {
2570    off_t rv = ::lseek(fd, size-2, SEEK_SET);
2571    if (rv != (off_t)-1) {
2572      if (::write(fd, "", 1) == 1) {
2573        mmap(base, size,
2574             PROT_READ|PROT_WRITE|PROT_EXEC,
2575             MAP_PRIVATE|MAP_FIXED|MAP_NORESERVE, fd, 0);
2576      }
2577    }
2578    ::close(fd);
2579    unlink(buf);
2580  }
2581}
2582
// Decides whether an errno value from a failed mmap() may be handed back to
// the caller (true) or must be treated as fatal (false).
//
// The recoverable set (EBADF, EINVAL, ENOTSUP) comes from JBS-6843484. No
// Linux man page documents this specific set, so while it currently matches
// Solaris it may change as experience with this failure mode grows.
//
// Any other error on this OS can cause the reserved mapping to be lost. That
// can cause confusion where different data structures think they have the
// same memory mapped; the worst scenario is if both the VM and a library
// think they have the same memory mapped.
static bool recoverable_mmap_error(int err) {
  const bool caller_can_handle =
    (err == EBADF) || (err == EINVAL) || (err == ENOTSUP);
  return caller_can_handle;
}
2605
2606static void warn_fail_commit_memory(char* addr, size_t size, bool exec,
2607                                    int err) {
2608  warning("INFO: os::commit_memory(" PTR_FORMAT ", " SIZE_FORMAT
2609          ", %d) failed; error='%s' (errno=%d)", p2i(addr), size, exec,
2610          os::strerror(err), err);
2611}
2612
2613static void warn_fail_commit_memory(char* addr, size_t size,
2614                                    size_t alignment_hint, bool exec,
2615                                    int err) {
2616  warning("INFO: os::commit_memory(" PTR_FORMAT ", " SIZE_FORMAT
2617          ", " SIZE_FORMAT ", %d) failed; error='%s' (errno=%d)", p2i(addr), size,
2618          alignment_hint, exec, os::strerror(err), err);
2619}
2620
2621// NOTE: Linux kernel does not really reserve the pages for us.
2622//       All it does is to check if there are enough free pages
2623//       left at the time of mmap(). This could be a potential
2624//       problem.
2625int os::Linux::commit_memory_impl(char* addr, size_t size, bool exec) {
2626  int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
2627  uintptr_t res = (uintptr_t) ::mmap(addr, size, prot,
2628                                     MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0);
2629  if (res != (uintptr_t) MAP_FAILED) {
2630    if (UseNUMAInterleaving) {
2631      numa_make_global(addr, size);
2632    }
2633    return 0;
2634  }
2635
2636  int err = errno;  // save errno from mmap() call above
2637
2638  if (!recoverable_mmap_error(err)) {
2639    warn_fail_commit_memory(addr, size, exec, err);
2640    vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "committing reserved memory.");
2641  }
2642
2643  return err;
2644}
2645
2646bool os::pd_commit_memory(char* addr, size_t size, bool exec) {
2647  return os::Linux::commit_memory_impl(addr, size, exec) == 0;
2648}
2649
2650void os::pd_commit_memory_or_exit(char* addr, size_t size, bool exec,
2651                                  const char* mesg) {
2652  assert(mesg != NULL, "mesg must be specified");
2653  int err = os::Linux::commit_memory_impl(addr, size, exec);
2654  if (err != 0) {
2655    // the caller wants all commit errors to exit with the specified mesg:
2656    warn_fail_commit_memory(addr, size, exec, err);
2657    vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "%s", mesg);
2658  }
2659}
2660
// Fallback definitions so that HotSpot can still be built on old systems
// whose headers do not provide these constants.

// mmap() flag
#ifndef MAP_HUGETLB
  #define MAP_HUGETLB 0x40000
#endif

// madvise() advice value
#ifndef MADV_HUGEPAGE
  #define MADV_HUGEPAGE 14
#endif
2670
2671int os::Linux::commit_memory_impl(char* addr, size_t size,
2672                                  size_t alignment_hint, bool exec) {
2673  int err = os::Linux::commit_memory_impl(addr, size, exec);
2674  if (err == 0) {
2675    realign_memory(addr, size, alignment_hint);
2676  }
2677  return err;
2678}
2679
2680bool os::pd_commit_memory(char* addr, size_t size, size_t alignment_hint,
2681                          bool exec) {
2682  return os::Linux::commit_memory_impl(addr, size, alignment_hint, exec) == 0;
2683}
2684
2685void os::pd_commit_memory_or_exit(char* addr, size_t size,
2686                                  size_t alignment_hint, bool exec,
2687                                  const char* mesg) {
2688  assert(mesg != NULL, "mesg must be specified");
2689  int err = os::Linux::commit_memory_impl(addr, size, alignment_hint, exec);
2690  if (err != 0) {
2691    // the caller wants all commit errors to exit with the specified mesg:
2692    warn_fail_commit_memory(addr, size, alignment_hint, exec, err);
2693    vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "%s", mesg);
2694  }
2695}
2696
2697void os::pd_realign_memory(char *addr, size_t bytes, size_t alignment_hint) {
2698  if (UseTransparentHugePages && alignment_hint > (size_t)vm_page_size()) {
2699    // We don't check the return value: madvise(MADV_HUGEPAGE) may not
2700    // be supported or the memory may already be backed by huge pages.
2701    ::madvise(addr, bytes, MADV_HUGEPAGE);
2702  }
2703}
2704
2705void os::pd_free_memory(char *addr, size_t bytes, size_t alignment_hint) {
2706  // This method works by doing an mmap over an existing mmaping and effectively discarding
2707  // the existing pages. However it won't work for SHM-based large pages that cannot be
2708  // uncommitted at all. We don't do anything in this case to avoid creating a segment with
2709  // small pages on top of the SHM segment. This method always works for small pages, so we
2710  // allow that in any case.
2711  if (alignment_hint <= (size_t)os::vm_page_size() || can_commit_large_page_memory()) {
2712    commit_memory(addr, bytes, alignment_hint, !ExecMem);
2713  }
2714}
2715
2716void os::numa_make_global(char *addr, size_t bytes) {
2717  Linux::numa_interleave_memory(addr, bytes);
2718}
2719
2720// Define for numa_set_bind_policy(int). Setting the argument to 0 will set the
2721// bind policy to MPOL_PREFERRED for the current thread.
2722#define USE_MPOL_PREFERRED 0
2723
2724void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) {
2725  // To make NUMA and large pages more robust when both enabled, we need to ease
2726  // the requirements on where the memory should be allocated. MPOL_BIND is the
2727  // default policy and it will force memory to be allocated on the specified
2728  // node. Changing this to MPOL_PREFERRED will prefer to allocate the memory on
2729  // the specified node, but will not force it. Using this policy will prevent
2730  // getting SIGBUS when trying to allocate large pages on NUMA nodes with no
2731  // free large pages.
2732  Linux::numa_set_bind_policy(USE_MPOL_PREFERRED);
2733  Linux::numa_tonode_memory(addr, bytes, lgrp_hint);
2734}
2735
2736bool os::numa_topology_changed() { return false; }
2737
2738size_t os::numa_get_groups_num() {
2739  // Return just the number of nodes in which it's possible to allocate memory
2740  // (in numa terminology, configured nodes).
2741  return Linux::numa_num_configured_nodes();
2742}
2743
2744int os::numa_get_group_id() {
2745  int cpu_id = Linux::sched_getcpu();
2746  if (cpu_id != -1) {
2747    int lgrp_id = Linux::get_node_by_cpu(cpu_id);
2748    if (lgrp_id != -1) {
2749      return lgrp_id;
2750    }
2751  }
2752  return 0;
2753}
2754
2755int os::Linux::get_existing_num_nodes() {
2756  size_t node;
2757  size_t highest_node_number = Linux::numa_max_node();
2758  int num_nodes = 0;
2759
2760  // Get the total number of nodes in the system including nodes without memory.
2761  for (node = 0; node <= highest_node_number; node++) {
2762    if (isnode_in_existing_nodes(node)) {
2763      num_nodes++;
2764    }
2765  }
2766  return num_nodes;
2767}
2768
2769size_t os::numa_get_leaf_groups(int *ids, size_t size) {
2770  size_t highest_node_number = Linux::numa_max_node();
2771  size_t i = 0;
2772
2773  // Map all node ids in which is possible to allocate memory. Also nodes are
2774  // not always consecutively available, i.e. available from 0 to the highest
2775  // node number.
2776  for (size_t node = 0; node <= highest_node_number; node++) {
2777    if (Linux::isnode_in_configured_nodes(node)) {
2778      ids[i++] = node;
2779    }
2780  }
2781  return i;
2782}
2783
2784bool os::get_page_info(char *start, page_info* info) {
2785  return false;
2786}
2787
2788char *os::scan_pages(char *start, char* end, page_info* page_expected,
2789                     page_info* page_found) {
2790  return end;
2791}
2792
2793
2794int os::Linux::sched_getcpu_syscall(void) {
2795  unsigned int cpu = 0;
2796  int retval = -1;
2797
2798#if defined(IA32)
2799  #ifndef SYS_getcpu
2800    #define SYS_getcpu 318
2801  #endif
2802  retval = syscall(SYS_getcpu, &cpu, NULL, NULL);
2803#elif defined(AMD64)
2804// Unfortunately we have to bring all these macros here from vsyscall.h
2805// to be able to compile on old linuxes.
2806  #define __NR_vgetcpu 2
2807  #define VSYSCALL_START (-10UL << 20)
2808  #define VSYSCALL_SIZE 1024
2809  #define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr))
2810  typedef long (*vgetcpu_t)(unsigned int *cpu, unsigned int *node, unsigned long *tcache);
2811  vgetcpu_t vgetcpu = (vgetcpu_t)VSYSCALL_ADDR(__NR_vgetcpu);
2812  retval = vgetcpu(&cpu, NULL, NULL);
2813#endif
2814
2815  return (retval == -1) ? retval : cpu;
2816}
2817
2818// Something to do with the numa-aware allocator needs these symbols
2819extern "C" JNIEXPORT void numa_warn(int number, char *where, ...) { }
2820extern "C" JNIEXPORT void numa_error(char *where) { }
2821
2822// Handle request to load libnuma symbol version 1.1 (API v1). If it fails
2823// load symbol from base version instead.
2824void* os::Linux::libnuma_dlsym(void* handle, const char *name) {
2825  void *f = dlvsym(handle, name, "libnuma_1.1");
2826  if (f == NULL) {
2827    f = dlsym(handle, name);
2828  }
2829  return f;
2830}
2831
2832// Handle request to load libnuma symbol version 1.2 (API v2) only.
2833// Return NULL if the symbol is not defined in this particular version.
2834void* os::Linux::libnuma_v2_dlsym(void* handle, const char* name) {
2835  return dlvsym(handle, name, "libnuma_1.2");
2836}
2837
2838bool os::Linux::libnuma_init() {
2839  // sched_getcpu() should be in libc.
2840  set_sched_getcpu(CAST_TO_FN_PTR(sched_getcpu_func_t,
2841                                  dlsym(RTLD_DEFAULT, "sched_getcpu")));
2842
2843  // If it's not, try a direct syscall.
2844  if (sched_getcpu() == -1) {
2845    set_sched_getcpu(CAST_TO_FN_PTR(sched_getcpu_func_t,
2846                                    (void*)&sched_getcpu_syscall));
2847  }
2848
2849  if (sched_getcpu() != -1) { // Does it work?
2850    void *handle = dlopen("libnuma.so.1", RTLD_LAZY);
2851    if (handle != NULL) {
2852      set_numa_node_to_cpus(CAST_TO_FN_PTR(numa_node_to_cpus_func_t,
2853                                           libnuma_dlsym(handle, "numa_node_to_cpus")));
2854      set_numa_max_node(CAST_TO_FN_PTR(numa_max_node_func_t,
2855                                       libnuma_dlsym(handle, "numa_max_node")));
2856      set_numa_num_configured_nodes(CAST_TO_FN_PTR(numa_num_configured_nodes_func_t,
2857                                                   libnuma_dlsym(handle, "numa_num_configured_nodes")));
2858      set_numa_available(CAST_TO_FN_PTR(numa_available_func_t,
2859                                        libnuma_dlsym(handle, "numa_available")));
2860      set_numa_tonode_memory(CAST_TO_FN_PTR(numa_tonode_memory_func_t,
2861                                            libnuma_dlsym(handle, "numa_tonode_memory")));
2862      set_numa_interleave_memory(CAST_TO_FN_PTR(numa_interleave_memory_func_t,
2863                                                libnuma_dlsym(handle, "numa_interleave_memory")));
2864      set_numa_interleave_memory_v2(CAST_TO_FN_PTR(numa_interleave_memory_v2_func_t,
2865                                                libnuma_v2_dlsym(handle, "numa_interleave_memory")));
2866      set_numa_set_bind_policy(CAST_TO_FN_PTR(numa_set_bind_policy_func_t,
2867                                              libnuma_dlsym(handle, "numa_set_bind_policy")));
2868      set_numa_bitmask_isbitset(CAST_TO_FN_PTR(numa_bitmask_isbitset_func_t,
2869                                               libnuma_dlsym(handle, "numa_bitmask_isbitset")));
2870      set_numa_distance(CAST_TO_FN_PTR(numa_distance_func_t,
2871                                       libnuma_dlsym(handle, "numa_distance")));
2872
2873      if (numa_available() != -1) {
2874        set_numa_all_nodes((unsigned long*)libnuma_dlsym(handle, "numa_all_nodes"));
2875        set_numa_all_nodes_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_all_nodes_ptr"));
2876        set_numa_nodes_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_nodes_ptr"));
2877        // Create an index -> node mapping, since nodes are not always consecutive
2878        _nindex_to_node = new (ResourceObj::C_HEAP, mtInternal) GrowableArray<int>(0, true);
2879        rebuild_nindex_to_node_map();
2880        // Create a cpu -> node mapping
2881        _cpu_to_node = new (ResourceObj::C_HEAP, mtInternal) GrowableArray<int>(0, true);
2882        rebuild_cpu_to_node_map();
2883        return true;
2884      }
2885    }
2886  }
2887  return false;
2888}
2889
2890size_t os::Linux::default_guard_size(os::ThreadType thr_type) {
2891  // Creating guard page is very expensive. Java thread has HotSpot
2892  // guard pages, only enable glibc guard page for non-Java threads.
2893  // (Remember: compiler thread is a Java thread, too!)
2894  return ((thr_type == java_thread || thr_type == compiler_thread) ? 0 : page_size());
2895}
2896
2897void os::Linux::rebuild_nindex_to_node_map() {
2898  int highest_node_number = Linux::numa_max_node();
2899
2900  nindex_to_node()->clear();
2901  for (int node = 0; node <= highest_node_number; node++) {
2902    if (Linux::isnode_in_existing_nodes(node)) {
2903      nindex_to_node()->append(node);
2904    }
2905  }
2906}
2907
2908// rebuild_cpu_to_node_map() constructs a table mapping cpud id to node id.
2909// The table is later used in get_node_by_cpu().
2910void os::Linux::rebuild_cpu_to_node_map() {
2911  const size_t NCPUS = 32768; // Since the buffer size computation is very obscure
2912                              // in libnuma (possible values are starting from 16,
2913                              // and continuing up with every other power of 2, but less
2914                              // than the maximum number of CPUs supported by kernel), and
2915                              // is a subject to change (in libnuma version 2 the requirements
2916                              // are more reasonable) we'll just hardcode the number they use
2917                              // in the library.
2918  const size_t BitsPerCLong = sizeof(long) * CHAR_BIT;
2919
2920  size_t cpu_num = processor_count();
2921  size_t cpu_map_size = NCPUS / BitsPerCLong;
2922  size_t cpu_map_valid_size =
2923    MIN2((cpu_num + BitsPerCLong - 1) / BitsPerCLong, cpu_map_size);
2924
2925  cpu_to_node()->clear();
2926  cpu_to_node()->at_grow(cpu_num - 1);
2927
2928  size_t node_num = get_existing_num_nodes();
2929
2930  int distance = 0;
2931  int closest_distance = INT_MAX;
2932  int closest_node = 0;
2933  unsigned long *cpu_map = NEW_C_HEAP_ARRAY(unsigned long, cpu_map_size, mtInternal);
2934  for (size_t i = 0; i < node_num; i++) {
2935    // Check if node is configured (not a memory-less node). If it is not, find
2936    // the closest configured node.
2937    if (!isnode_in_configured_nodes(nindex_to_node()->at(i))) {
2938      closest_distance = INT_MAX;
2939      // Check distance from all remaining nodes in the system. Ignore distance
2940      // from itself and from another non-configured node.
2941      for (size_t m = 0; m < node_num; m++) {
2942        if (m != i && isnode_in_configured_nodes(nindex_to_node()->at(m))) {
2943          distance = numa_distance(nindex_to_node()->at(i), nindex_to_node()->at(m));
2944          // If a closest node is found, update. There is always at least one
2945          // configured node in the system so there is always at least one node
2946          // close.
2947          if (distance != 0 && distance < closest_distance) {
2948            closest_distance = distance;
2949            closest_node = nindex_to_node()->at(m);
2950          }
2951        }
2952      }
2953     } else {
2954       // Current node is already a configured node.
2955       closest_node = nindex_to_node()->at(i);
2956     }
2957
2958    // Get cpus from the original node and map them to the closest node. If node
2959    // is a configured node (not a memory-less node), then original node and
2960    // closest node are the same.
2961    if (numa_node_to_cpus(nindex_to_node()->at(i), cpu_map, cpu_map_size * sizeof(unsigned long)) != -1) {
2962      for (size_t j = 0; j < cpu_map_valid_size; j++) {
2963        if (cpu_map[j] != 0) {
2964          for (size_t k = 0; k < BitsPerCLong; k++) {
2965            if (cpu_map[j] & (1UL << k)) {
2966              cpu_to_node()->at_put(j * BitsPerCLong + k, closest_node);
2967            }
2968          }
2969        }
2970      }
2971    }
2972  }
2973  FREE_C_HEAP_ARRAY(unsigned long, cpu_map);
2974}
2975
2976int os::Linux::get_node_by_cpu(int cpu_id) {
2977  if (cpu_to_node() != NULL && cpu_id >= 0 && cpu_id < cpu_to_node()->length()) {
2978    return cpu_to_node()->at(cpu_id);
2979  }
2980  return -1;
2981}
2982
2983GrowableArray<int>* os::Linux::_cpu_to_node;
2984GrowableArray<int>* os::Linux::_nindex_to_node;
2985os::Linux::sched_getcpu_func_t os::Linux::_sched_getcpu;
2986os::Linux::numa_node_to_cpus_func_t os::Linux::_numa_node_to_cpus;
2987os::Linux::numa_max_node_func_t os::Linux::_numa_max_node;
2988os::Linux::numa_num_configured_nodes_func_t os::Linux::_numa_num_configured_nodes;
2989os::Linux::numa_available_func_t os::Linux::_numa_available;
2990os::Linux::numa_tonode_memory_func_t os::Linux::_numa_tonode_memory;
2991os::Linux::numa_interleave_memory_func_t os::Linux::_numa_interleave_memory;
2992os::Linux::numa_interleave_memory_v2_func_t os::Linux::_numa_interleave_memory_v2;
2993os::Linux::numa_set_bind_policy_func_t os::Linux::_numa_set_bind_policy;
2994os::Linux::numa_bitmask_isbitset_func_t os::Linux::_numa_bitmask_isbitset;
2995os::Linux::numa_distance_func_t os::Linux::_numa_distance;
2996unsigned long* os::Linux::_numa_all_nodes;
2997struct bitmask* os::Linux::_numa_all_nodes_ptr;
2998struct bitmask* os::Linux::_numa_nodes_ptr;
2999
3000bool os::pd_uncommit_memory(char* addr, size_t size) {
3001  uintptr_t res = (uintptr_t) ::mmap(addr, size, PROT_NONE,
3002                                     MAP_PRIVATE|MAP_FIXED|MAP_NORESERVE|MAP_ANONYMOUS, -1, 0);
3003  return res  != (uintptr_t) MAP_FAILED;
3004}
3005
3006static address get_stack_commited_bottom(address bottom, size_t size) {
3007  address nbot = bottom;
3008  address ntop = bottom + size;
3009
3010  size_t page_sz = os::vm_page_size();
3011  unsigned pages = size / page_sz;
3012
3013  unsigned char vec[1];
3014  unsigned imin = 1, imax = pages + 1, imid;
3015  int mincore_return_value = 0;
3016
3017  assert(imin <= imax, "Unexpected page size");
3018
3019  while (imin < imax) {
3020    imid = (imax + imin) / 2;
3021    nbot = ntop - (imid * page_sz);
3022
3023    // Use a trick with mincore to check whether the page is mapped or not.
3024    // mincore sets vec to 1 if page resides in memory and to 0 if page
3025    // is swapped output but if page we are asking for is unmapped
3026    // it returns -1,ENOMEM
3027    mincore_return_value = mincore(nbot, page_sz, vec);
3028
3029    if (mincore_return_value == -1) {
3030      // Page is not mapped go up
3031      // to find first mapped page
3032      if (errno != EAGAIN) {
3033        assert(errno == ENOMEM, "Unexpected mincore errno");
3034        imax = imid;
3035      }
3036    } else {
3037      // Page is mapped go down
3038      // to find first not mapped page
3039      imin = imid + 1;
3040    }
3041  }
3042
3043  nbot = nbot + page_sz;
3044
3045  // Adjust stack bottom one page up if last checked page is not mapped
3046  if (mincore_return_value == -1) {
3047    nbot = nbot + page_sz;
3048  }
3049
3050  return nbot;
3051}
3052
3053
3054// Linux uses a growable mapping for the stack, and if the mapping for
3055// the stack guard pages is not removed when we detach a thread the
3056// stack cannot grow beyond the pages where the stack guard was
3057// mapped.  If at some point later in the process the stack expands to
3058// that point, the Linux kernel cannot expand the stack any further
3059// because the guard pages are in the way, and a segfault occurs.
3060//
3061// However, it's essential not to split the stack region by unmapping
3062// a region (leaving a hole) that's already part of the stack mapping,
3063// so if the stack mapping has already grown beyond the guard pages at
3064// the time we create them, we have to truncate the stack mapping.
3065// So, we need to know the extent of the stack mapping when
3066// create_stack_guard_pages() is called.
3067
// We only need this for stacks that are growable: at the time of
// writing thread stacks don't use growable mappings (i.e. those
// created with MAP_GROWSDOWN), and aren't marked "[stack]", so this
// only applies to the main thread.
3072
3073// If the (growable) stack mapping already extends beyond the point
3074// where we're going to put our guard pages, truncate the mapping at
3075// that point by munmap()ping it.  This ensures that when we later
3076// munmap() the guard pages we don't leave a hole in the stack
3077// mapping. This only affects the main/initial thread
3078
3079bool os::pd_create_stack_guard_pages(char* addr, size_t size) {
3080  if (os::Linux::is_initial_thread()) {
3081    // As we manually grow stack up to bottom inside create_attached_thread(),
3082    // it's likely that os::Linux::initial_thread_stack_bottom is mapped and
3083    // we don't need to do anything special.
3084    // Check it first, before calling heavy function.
3085    uintptr_t stack_extent = (uintptr_t) os::Linux::initial_thread_stack_bottom();
3086    unsigned char vec[1];
3087
3088    if (mincore((address)stack_extent, os::vm_page_size(), vec) == -1) {
3089      // Fallback to slow path on all errors, including EAGAIN
3090      stack_extent = (uintptr_t) get_stack_commited_bottom(
3091                                                           os::Linux::initial_thread_stack_bottom(),
3092                                                           (size_t)addr - stack_extent);
3093    }
3094
3095    if (stack_extent < (uintptr_t)addr) {
3096      ::munmap((void*)stack_extent, (uintptr_t)(addr - stack_extent));
3097    }
3098  }
3099
3100  return os::commit_memory(addr, size, !ExecMem);
3101}
3102
3103// If this is a growable mapping, remove the guard pages entirely by
3104// munmap()ping them.  If not, just call uncommit_memory(). This only
3105// affects the main/initial thread, but guard against future OS changes
3106// It's safe to always unmap guard pages for initial thread because we
3107// always place it right after end of the mapped region
3108
3109bool os::remove_stack_guard_pages(char* addr, size_t size) {
3110  uintptr_t stack_extent, stack_base;
3111
3112  if (os::Linux::is_initial_thread()) {
3113    return ::munmap(addr, size) == 0;
3114  }
3115
3116  return os::uncommit_memory(addr, size);
3117}
3118
3119// If 'fixed' is true, anon_mmap() will attempt to reserve anonymous memory
3120// at 'requested_addr'. If there are existing memory mappings at the same
3121// location, however, they will be overwritten. If 'fixed' is false,
3122// 'requested_addr' is only treated as a hint, the return value may or
3123// may not start from the requested address. Unlike Linux mmap(), this
3124// function returns NULL to indicate failure.
3125static char* anon_mmap(char* requested_addr, size_t bytes, bool fixed) {
3126  char * addr;
3127  int flags;
3128
3129  flags = MAP_PRIVATE | MAP_NORESERVE | MAP_ANONYMOUS;
3130  if (fixed) {
3131    assert((uintptr_t)requested_addr % os::Linux::page_size() == 0, "unaligned address");
3132    flags |= MAP_FIXED;
3133  }
3134
3135  // Map reserved/uncommitted pages PROT_NONE so we fail early if we
3136  // touch an uncommitted page. Otherwise, the read/write might
3137  // succeed if we have enough swap space to back the physical page.
3138  addr = (char*)::mmap(requested_addr, bytes, PROT_NONE,
3139                       flags, -1, 0);
3140
3141  return addr == MAP_FAILED ? NULL : addr;
3142}
3143
3144// Allocate (using mmap, NO_RESERVE, with small pages) at either a given request address
3145//   (req_addr != NULL) or with a given alignment.
3146//  - bytes shall be a multiple of alignment.
3147//  - req_addr can be NULL. If not NULL, it must be a multiple of alignment.
3148//  - alignment sets the alignment at which memory shall be allocated.
3149//     It must be a multiple of allocation granularity.
3150// Returns address of memory or NULL. If req_addr was not NULL, will only return
3151//  req_addr or NULL.
3152static char* anon_mmap_aligned(size_t bytes, size_t alignment, char* req_addr) {
3153
3154  size_t extra_size = bytes;
3155  if (req_addr == NULL && alignment > 0) {
3156    extra_size += alignment;
3157  }
3158
3159  char* start = (char*) ::mmap(req_addr, extra_size, PROT_NONE,
3160    MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE,
3161    -1, 0);
3162  if (start == MAP_FAILED) {
3163    start = NULL;
3164  } else {
3165    if (req_addr != NULL) {
3166      if (start != req_addr) {
3167        ::munmap(start, extra_size);
3168        start = NULL;
3169      }
3170    } else {
3171      char* const start_aligned = (char*) align_ptr_up(start, alignment);
3172      char* const end_aligned = start_aligned + bytes;
3173      char* const end = start + extra_size;
3174      if (start_aligned > start) {
3175        ::munmap(start, start_aligned - start);
3176      }
3177      if (end_aligned < end) {
3178        ::munmap(end_aligned, end - end_aligned);
3179      }
3180      start = start_aligned;
3181    }
3182  }
3183  return start;
3184}
3185
// Unmaps [addr, addr + size). Returns 1 if munmap() succeeded, 0 otherwise.
static int anon_munmap(char * addr, size_t size) {
  const int rc = ::munmap(addr, size);
  return (rc == 0) ? 1 : 0;
}
3189
3190char* os::pd_reserve_memory(size_t bytes, char* requested_addr,
3191                            size_t alignment_hint) {
3192  return anon_mmap(requested_addr, bytes, (requested_addr != NULL));
3193}
3194
3195bool os::pd_release_memory(char* addr, size_t size) {
3196  return anon_munmap(addr, size);
3197}
3198
3199static bool linux_mprotect(char* addr, size_t size, int prot) {
3200  // Linux wants the mprotect address argument to be page aligned.
3201  char* bottom = (char*)align_size_down((intptr_t)addr, os::Linux::page_size());
3202
3203  // According to SUSv3, mprotect() should only be used with mappings
3204  // established by mmap(), and mmap() always maps whole pages. Unaligned
3205  // 'addr' likely indicates problem in the VM (e.g. trying to change
3206  // protection of malloc'ed or statically allocated memory). Check the
3207  // caller if you hit this assert.
3208  assert(addr == bottom, "sanity check");
3209
3210  size = align_size_up(pointer_delta(addr, bottom, 1) + size, os::Linux::page_size());
3211  return ::mprotect(bottom, size, prot) == 0;
3212}
3213
3214// Set protections specified
3215bool os::protect_memory(char* addr, size_t bytes, ProtType prot,
3216                        bool is_committed) {
3217  unsigned int p = 0;
3218  switch (prot) {
3219  case MEM_PROT_NONE: p = PROT_NONE; break;
3220  case MEM_PROT_READ: p = PROT_READ; break;
3221  case MEM_PROT_RW:   p = PROT_READ|PROT_WRITE; break;
3222  case MEM_PROT_RWX:  p = PROT_READ|PROT_WRITE|PROT_EXEC; break;
3223  default:
3224    ShouldNotReachHere();
3225  }
3226  // is_committed is unused.
3227  return linux_mprotect(addr, bytes, p);
3228}
3229
3230bool os::guard_memory(char* addr, size_t size) {
3231  return linux_mprotect(addr, size, PROT_NONE);
3232}
3233
3234bool os::unguard_memory(char* addr, size_t size) {
3235  return linux_mprotect(addr, size, PROT_READ|PROT_WRITE);
3236}
3237
3238bool os::Linux::transparent_huge_pages_sanity_check(bool warn,
3239                                                    size_t page_size) {
3240  bool result = false;
3241  void *p = mmap(NULL, page_size * 2, PROT_READ|PROT_WRITE,
3242                 MAP_ANONYMOUS|MAP_PRIVATE,
3243                 -1, 0);
3244  if (p != MAP_FAILED) {
3245    void *aligned_p = align_ptr_up(p, page_size);
3246
3247    result = madvise(aligned_p, page_size, MADV_HUGEPAGE) == 0;
3248
3249    munmap(p, page_size * 2);
3250  }
3251
3252  if (warn && !result) {
3253    warning("TransparentHugePages is not supported by the operating system.");
3254  }
3255
3256  return result;
3257}
3258
3259bool os::Linux::hugetlbfs_sanity_check(bool warn, size_t page_size) {
3260  bool result = false;
3261  void *p = mmap(NULL, page_size, PROT_READ|PROT_WRITE,
3262                 MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB,
3263                 -1, 0);
3264
3265  if (p != MAP_FAILED) {
3266    // We don't know if this really is a huge page or not.
3267    FILE *fp = fopen("/proc/self/maps", "r");
3268    if (fp) {
3269      while (!feof(fp)) {
3270        char chars[257];
3271        long x = 0;
3272        if (fgets(chars, sizeof(chars), fp)) {
3273          if (sscanf(chars, "%lx-%*x", &x) == 1
3274              && x == (long)p) {
3275            if (strstr (chars, "hugepage")) {
3276              result = true;
3277              break;
3278            }
3279          }
3280        }
3281      }
3282      fclose(fp);
3283    }
3284    munmap(p, page_size);
3285  }
3286
3287  if (warn && !result) {
3288    warning("HugeTLBFS is not supported by the operating system.");
3289  }
3290
3291  return result;
3292}
3293
3294// Set the coredump_filter bits to include largepages in core dump (bit 6)
3295//
3296// From the coredump_filter documentation:
3297//
3298// - (bit 0) anonymous private memory
3299// - (bit 1) anonymous shared memory
3300// - (bit 2) file-backed private memory
3301// - (bit 3) file-backed shared memory
3302// - (bit 4) ELF header pages in file-backed private memory areas (it is
3303//           effective only if the bit 2 is cleared)
3304// - (bit 5) hugetlb private memory
3305// - (bit 6) hugetlb shared memory
3306//
3307static void set_coredump_filter(void) {
3308  FILE *f;
3309  long cdm;
3310
3311  if ((f = fopen("/proc/self/coredump_filter", "r+")) == NULL) {
3312    return;
3313  }
3314
3315  if (fscanf(f, "%lx", &cdm) != 1) {
3316    fclose(f);
3317    return;
3318  }
3319
3320  rewind(f);
3321
3322  if ((cdm & LARGEPAGES_BIT) == 0) {
3323    cdm |= LARGEPAGES_BIT;
3324    fprintf(f, "%#lx", cdm);
3325  }
3326
3327  fclose(f);
3328}
3329
3330// Large page support
3331
// Large page size in bytes; 0 until setup_large_page_size() assigns it.
static size_t _large_page_size = 0;
3333
3334size_t os::Linux::find_large_page_size() {
3335  size_t large_page_size = 0;
3336
3337  // large_page_size on Linux is used to round up heap size. x86 uses either
3338  // 2M or 4M page, depending on whether PAE (Physical Address Extensions)
3339  // mode is enabled. AMD64/EM64T uses 2M page in 64bit mode. IA64 can use
3340  // page as large as 256M.
3341  //
3342  // Here we try to figure out page size by parsing /proc/meminfo and looking
3343  // for a line with the following format:
3344  //    Hugepagesize:     2048 kB
3345  //
3346  // If we can't determine the value (e.g. /proc is not mounted, or the text
3347  // format has been changed), we'll use the largest page size supported by
3348  // the processor.
3349
3350#ifndef ZERO
3351  large_page_size =
3352    AARCH64_ONLY(2 * M)
3353    AMD64_ONLY(2 * M)
3354    ARM32_ONLY(2 * M)
3355    IA32_ONLY(4 * M)
3356    IA64_ONLY(256 * M)
3357    PPC_ONLY(4 * M)
3358    S390_ONLY(1 * M)
3359    SPARC_ONLY(4 * M);
3360#endif // ZERO
3361
3362  FILE *fp = fopen("/proc/meminfo", "r");
3363  if (fp) {
3364    while (!feof(fp)) {
3365      int x = 0;
3366      char buf[16];
3367      if (fscanf(fp, "Hugepagesize: %d", &x) == 1) {
3368        if (x && fgets(buf, sizeof(buf), fp) && strcmp(buf, " kB\n") == 0) {
3369          large_page_size = x * K;
3370          break;
3371        }
3372      } else {
3373        // skip to next line
3374        for (;;) {
3375          int ch = fgetc(fp);
3376          if (ch == EOF || ch == (int)'\n') break;
3377        }
3378      }
3379    }
3380    fclose(fp);
3381  }
3382
3383  if (!FLAG_IS_DEFAULT(LargePageSizeInBytes) && LargePageSizeInBytes != large_page_size) {
3384    warning("Setting LargePageSizeInBytes has no effect on this OS. Large page size is "
3385            SIZE_FORMAT "%s.", byte_size_in_proper_unit(large_page_size),
3386            proper_unit_for_byte_size(large_page_size));
3387  }
3388
3389  return large_page_size;
3390}
3391
3392size_t os::Linux::setup_large_page_size() {
3393  _large_page_size = Linux::find_large_page_size();
3394  const size_t default_page_size = (size_t)Linux::page_size();
3395  if (_large_page_size > default_page_size) {
3396    _page_sizes[0] = _large_page_size;
3397    _page_sizes[1] = default_page_size;
3398    _page_sizes[2] = 0;
3399  }
3400
3401  return _large_page_size;
3402}
3403
3404bool os::Linux::setup_large_page_type(size_t page_size) {
3405  if (FLAG_IS_DEFAULT(UseHugeTLBFS) &&
3406      FLAG_IS_DEFAULT(UseSHM) &&
3407      FLAG_IS_DEFAULT(UseTransparentHugePages)) {
3408
3409    // The type of large pages has not been specified by the user.
3410
3411    // Try UseHugeTLBFS and then UseSHM.
3412    UseHugeTLBFS = UseSHM = true;
3413
3414    // Don't try UseTransparentHugePages since there are known
3415    // performance issues with it turned on. This might change in the future.
3416    UseTransparentHugePages = false;
3417  }
3418
3419  if (UseTransparentHugePages) {
3420    bool warn_on_failure = !FLAG_IS_DEFAULT(UseTransparentHugePages);
3421    if (transparent_huge_pages_sanity_check(warn_on_failure, page_size)) {
3422      UseHugeTLBFS = false;
3423      UseSHM = false;
3424      return true;
3425    }
3426    UseTransparentHugePages = false;
3427  }
3428
3429  if (UseHugeTLBFS) {
3430    bool warn_on_failure = !FLAG_IS_DEFAULT(UseHugeTLBFS);
3431    if (hugetlbfs_sanity_check(warn_on_failure, page_size)) {
3432      UseSHM = false;
3433      return true;
3434    }
3435    UseHugeTLBFS = false;
3436  }
3437
3438  return UseSHM;
3439}
3440
3441void os::large_page_init() {
3442  if (!UseLargePages &&
3443      !UseTransparentHugePages &&
3444      !UseHugeTLBFS &&
3445      !UseSHM) {
3446    // Not using large pages.
3447    return;
3448  }
3449
3450  if (!FLAG_IS_DEFAULT(UseLargePages) && !UseLargePages) {
3451    // The user explicitly turned off large pages.
3452    // Ignore the rest of the large pages flags.
3453    UseTransparentHugePages = false;
3454    UseHugeTLBFS = false;
3455    UseSHM = false;
3456    return;
3457  }
3458
3459  size_t large_page_size = Linux::setup_large_page_size();
3460  UseLargePages          = Linux::setup_large_page_type(large_page_size);
3461
3462  set_coredump_filter();
3463}
3464
// Fallback for system headers that do not provide SHM_HUGETLB.
#if !defined(SHM_HUGETLB)
  #define SHM_HUGETLB 04000
#endif

// Print a warning through warning(format, ...), but only when large pages
// are in use and the user explicitly set at least one of UseLargePages,
// UseSHM or LargePageSizeInBytes.
#define shm_warning_format(format, ...)                \
  do {                                                 \
    if (UseLargePages) {                               \
      if (!FLAG_IS_DEFAULT(UseLargePages) ||           \
          !FLAG_IS_DEFAULT(UseSHM) ||                  \
          !FLAG_IS_DEFAULT(LargePageSizeInBytes)) {    \
        warning(format, __VA_ARGS__);                  \
      }                                                \
    }                                                  \
  } while (0)

// Same as shm_warning_format for a plain message string.
#define shm_warning(str) shm_warning_format("%s", str)

// Same as shm_warning, with the current errno value appended. str must be a
// string literal because it is concatenated with the format suffix. errno is
// captured first so that the flag checks cannot disturb it.
#define shm_warning_with_errno(str)                          \
  do {                                                       \
    int shm_errno = errno;                                   \
    shm_warning_format(str " (error = %d)", shm_errno);      \
  } while (0)
3486
3487static char* shmat_with_alignment(int shmid, size_t bytes, size_t alignment) {
3488  assert(is_size_aligned(bytes, alignment), "Must be divisible by the alignment");
3489
3490  if (!is_size_aligned(alignment, SHMLBA)) {
3491    assert(false, "Code below assumes that alignment is at least SHMLBA aligned");
3492    return NULL;
3493  }
3494
3495  // To ensure that we get 'alignment' aligned memory from shmat,
3496  // we pre-reserve aligned virtual memory and then attach to that.
3497
3498  char* pre_reserved_addr = anon_mmap_aligned(bytes, alignment, NULL);
3499  if (pre_reserved_addr == NULL) {
3500    // Couldn't pre-reserve aligned memory.
3501    shm_warning("Failed to pre-reserve aligned memory for shmat.");
3502    return NULL;
3503  }
3504
3505  // SHM_REMAP is needed to allow shmat to map over an existing mapping.
3506  char* addr = (char*)shmat(shmid, pre_reserved_addr, SHM_REMAP);
3507
3508  if ((intptr_t)addr == -1) {
3509    int err = errno;
3510    shm_warning_with_errno("Failed to attach shared memory.");
3511
3512    assert(err != EACCES, "Unexpected error");
3513    assert(err != EIDRM,  "Unexpected error");
3514    assert(err != EINVAL, "Unexpected error");
3515
3516    // Since we don't know if the kernel unmapped the pre-reserved memory area
3517    // we can't unmap it, since that would potentially unmap memory that was
3518    // mapped from other threads.
3519    return NULL;
3520  }
3521
3522  return addr;
3523}
3524
3525static char* shmat_at_address(int shmid, char* req_addr) {
3526  if (!is_ptr_aligned(req_addr, SHMLBA)) {
3527    assert(false, "Requested address needs to be SHMLBA aligned");
3528    return NULL;
3529  }
3530
3531  char* addr = (char*)shmat(shmid, req_addr, 0);
3532
3533  if ((intptr_t)addr == -1) {
3534    shm_warning_with_errno("Failed to attach shared memory.");
3535    return NULL;
3536  }
3537
3538  return addr;
3539}
3540
3541static char* shmat_large_pages(int shmid, size_t bytes, size_t alignment, char* req_addr) {
3542  // If a req_addr has been provided, we assume that the caller has already aligned the address.
3543  if (req_addr != NULL) {
3544    assert(is_ptr_aligned(req_addr, os::large_page_size()), "Must be divisible by the large page size");
3545    assert(is_ptr_aligned(req_addr, alignment), "Must be divisible by given alignment");
3546    return shmat_at_address(shmid, req_addr);
3547  }
3548
3549  // Since shmid has been setup with SHM_HUGETLB, shmat will automatically
3550  // return large page size aligned memory addresses when req_addr == NULL.
3551  // However, if the alignment is larger than the large page size, we have
3552  // to manually ensure that the memory returned is 'alignment' aligned.
3553  if (alignment > os::large_page_size()) {
3554    assert(is_size_aligned(alignment, os::large_page_size()), "Must be divisible by the large page size");
3555    return shmat_with_alignment(shmid, bytes, alignment);
3556  } else {
3557    return shmat_at_address(shmid, NULL);
3558  }
3559}
3560
3561char* os::Linux::reserve_memory_special_shm(size_t bytes, size_t alignment,
3562                                            char* req_addr, bool exec) {
3563  // "exec" is passed in but not used.  Creating the shared image for
3564  // the code cache doesn't have an SHM_X executable permission to check.
3565  assert(UseLargePages && UseSHM, "only for SHM large pages");
3566  assert(is_ptr_aligned(req_addr, os::large_page_size()), "Unaligned address");
3567  assert(is_ptr_aligned(req_addr, alignment), "Unaligned address");
3568
3569  if (!is_size_aligned(bytes, os::large_page_size())) {
3570    return NULL; // Fallback to small pages.
3571  }
3572
3573  // Create a large shared memory region to attach to based on size.
3574  // Currently, size is the total size of the heap.
3575  int shmid = shmget(IPC_PRIVATE, bytes, SHM_HUGETLB|IPC_CREAT|SHM_R|SHM_W);
3576  if (shmid == -1) {
3577    // Possible reasons for shmget failure:
3578    // 1. shmmax is too small for Java heap.
3579    //    > check shmmax value: cat /proc/sys/kernel/shmmax
3580    //    > increase shmmax value: echo "0xffffffff" > /proc/sys/kernel/shmmax
3581    // 2. not enough large page memory.
3582    //    > check available large pages: cat /proc/meminfo
3583    //    > increase amount of large pages:
3584    //          echo new_value > /proc/sys/vm/nr_hugepages
3585    //      Note 1: different Linux may use different name for this property,
3586    //            e.g. on Redhat AS-3 it is "hugetlb_pool".
3587    //      Note 2: it's possible there's enough physical memory available but
3588    //            they are so fragmented after a long run that they can't
3589    //            coalesce into large pages. Try to reserve large pages when
3590    //            the system is still "fresh".
3591    shm_warning_with_errno("Failed to reserve shared memory.");
3592    return NULL;
3593  }
3594
3595  // Attach to the region.
3596  char* addr = shmat_large_pages(shmid, bytes, alignment, req_addr);
3597
3598  // Remove shmid. If shmat() is successful, the actual shared memory segment
3599  // will be deleted when it's detached by shmdt() or when the process
3600  // terminates. If shmat() is not successful this will remove the shared
3601  // segment immediately.
3602  shmctl(shmid, IPC_RMID, NULL);
3603
3604  return addr;
3605}
3606
3607static void warn_on_large_pages_failure(char* req_addr, size_t bytes,
3608                                        int error) {
3609  assert(error == ENOMEM, "Only expect to fail if no memory is available");
3610
3611  bool warn_on_failure = UseLargePages &&
3612      (!FLAG_IS_DEFAULT(UseLargePages) ||
3613       !FLAG_IS_DEFAULT(UseHugeTLBFS) ||
3614       !FLAG_IS_DEFAULT(LargePageSizeInBytes));
3615
3616  if (warn_on_failure) {
3617    char msg[128];
3618    jio_snprintf(msg, sizeof(msg), "Failed to reserve large pages memory req_addr: "
3619                 PTR_FORMAT " bytes: " SIZE_FORMAT " (errno = %d).", req_addr, bytes, error);
3620    warning("%s", msg);
3621  }
3622}
3623
3624char* os::Linux::reserve_memory_special_huge_tlbfs_only(size_t bytes,
3625                                                        char* req_addr,
3626                                                        bool exec) {
3627  assert(UseLargePages && UseHugeTLBFS, "only for Huge TLBFS large pages");
3628  assert(is_size_aligned(bytes, os::large_page_size()), "Unaligned size");
3629  assert(is_ptr_aligned(req_addr, os::large_page_size()), "Unaligned address");
3630
3631  int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
3632  char* addr = (char*)::mmap(req_addr, bytes, prot,
3633                             MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB,
3634                             -1, 0);
3635
3636  if (addr == MAP_FAILED) {
3637    warn_on_large_pages_failure(req_addr, bytes, errno);
3638    return NULL;
3639  }
3640
3641  assert(is_ptr_aligned(addr, os::large_page_size()), "Must be");
3642
3643  return addr;
3644}
3645
3646// Reserve memory using mmap(MAP_HUGETLB).
3647//  - bytes shall be a multiple of alignment.
3648//  - req_addr can be NULL. If not NULL, it must be a multiple of alignment.
3649//  - alignment sets the alignment at which memory shall be allocated.
3650//     It must be a multiple of allocation granularity.
3651// Returns address of memory or NULL. If req_addr was not NULL, will only return
3652//  req_addr or NULL.
3653char* os::Linux::reserve_memory_special_huge_tlbfs_mixed(size_t bytes,
3654                                                         size_t alignment,
3655                                                         char* req_addr,
3656                                                         bool exec) {
3657  size_t large_page_size = os::large_page_size();
3658  assert(bytes >= large_page_size, "Shouldn't allocate large pages for small sizes");
3659
3660  assert(is_ptr_aligned(req_addr, alignment), "Must be");
3661  assert(is_size_aligned(bytes, alignment), "Must be");
3662
3663  // First reserve - but not commit - the address range in small pages.
3664  char* const start = anon_mmap_aligned(bytes, alignment, req_addr);
3665
3666  if (start == NULL) {
3667    return NULL;
3668  }
3669
3670  assert(is_ptr_aligned(start, alignment), "Must be");
3671
3672  char* end = start + bytes;
3673
3674  // Find the regions of the allocated chunk that can be promoted to large pages.
3675  char* lp_start = (char*)align_ptr_up(start, large_page_size);
3676  char* lp_end   = (char*)align_ptr_down(end, large_page_size);
3677
3678  size_t lp_bytes = lp_end - lp_start;
3679
3680  assert(is_size_aligned(lp_bytes, large_page_size), "Must be");
3681
3682  if (lp_bytes == 0) {
3683    // The mapped region doesn't even span the start and the end of a large page.
3684    // Fall back to allocate a non-special area.
3685    ::munmap(start, end - start);
3686    return NULL;
3687  }
3688
3689  int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
3690
3691  void* result;
3692
3693  // Commit small-paged leading area.
3694  if (start != lp_start) {
3695    result = ::mmap(start, lp_start - start, prot,
3696                    MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED,
3697                    -1, 0);
3698    if (result == MAP_FAILED) {
3699      ::munmap(lp_start, end - lp_start);
3700      return NULL;
3701    }
3702  }
3703
3704  // Commit large-paged area.
3705  result = ::mmap(lp_start, lp_bytes, prot,
3706                  MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED|MAP_HUGETLB,
3707                  -1, 0);
3708  if (result == MAP_FAILED) {
3709    warn_on_large_pages_failure(lp_start, lp_bytes, errno);
3710    // If the mmap above fails, the large pages region will be unmapped and we
3711    // have regions before and after with small pages. Release these regions.
3712    //
3713    // |  mapped  |  unmapped  |  mapped  |
3714    // ^          ^            ^          ^
3715    // start      lp_start     lp_end     end
3716    //
3717    ::munmap(start, lp_start - start);
3718    ::munmap(lp_end, end - lp_end);
3719    return NULL;
3720  }
3721
3722  // Commit small-paged trailing area.
3723  if (lp_end != end) {
3724    result = ::mmap(lp_end, end - lp_end, prot,
3725                    MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED,
3726                    -1, 0);
3727    if (result == MAP_FAILED) {
3728      ::munmap(start, lp_end - start);
3729      return NULL;
3730    }
3731  }
3732
3733  return start;
3734}
3735
3736char* os::Linux::reserve_memory_special_huge_tlbfs(size_t bytes,
3737                                                   size_t alignment,
3738                                                   char* req_addr,
3739                                                   bool exec) {
3740  assert(UseLargePages && UseHugeTLBFS, "only for Huge TLBFS large pages");
3741  assert(is_ptr_aligned(req_addr, alignment), "Must be");
3742  assert(is_size_aligned(alignment, os::vm_allocation_granularity()), "Must be");
3743  assert(is_power_of_2(os::large_page_size()), "Must be");
3744  assert(bytes >= os::large_page_size(), "Shouldn't allocate large pages for small sizes");
3745
3746  if (is_size_aligned(bytes, os::large_page_size()) && alignment <= os::large_page_size()) {
3747    return reserve_memory_special_huge_tlbfs_only(bytes, req_addr, exec);
3748  } else {
3749    return reserve_memory_special_huge_tlbfs_mixed(bytes, alignment, req_addr, exec);
3750  }
3751}
3752
3753char* os::reserve_memory_special(size_t bytes, size_t alignment,
3754                                 char* req_addr, bool exec) {
3755  assert(UseLargePages, "only for large pages");
3756
3757  char* addr;
3758  if (UseSHM) {
3759    addr = os::Linux::reserve_memory_special_shm(bytes, alignment, req_addr, exec);
3760  } else {
3761    assert(UseHugeTLBFS, "must be");
3762    addr = os::Linux::reserve_memory_special_huge_tlbfs(bytes, alignment, req_addr, exec);
3763  }
3764
3765  if (addr != NULL) {
3766    if (UseNUMAInterleaving) {
3767      numa_make_global(addr, bytes);
3768    }
3769
3770    // The memory is committed
3771    MemTracker::record_virtual_memory_reserve_and_commit((address)addr, bytes, CALLER_PC);
3772  }
3773
3774  return addr;
3775}
3776
3777bool os::Linux::release_memory_special_shm(char* base, size_t bytes) {
3778  // detaching the SHM segment will also delete it, see reserve_memory_special_shm()
3779  return shmdt(base) == 0;
3780}
3781
3782bool os::Linux::release_memory_special_huge_tlbfs(char* base, size_t bytes) {
3783  return pd_release_memory(base, bytes);
3784}
3785
3786bool os::release_memory_special(char* base, size_t bytes) {
3787  bool res;
3788  if (MemTracker::tracking_level() > NMT_minimal) {
3789    Tracker tkr = MemTracker::get_virtual_memory_release_tracker();
3790    res = os::Linux::release_memory_special_impl(base, bytes);
3791    if (res) {
3792      tkr.record((address)base, bytes);
3793    }
3794
3795  } else {
3796    res = os::Linux::release_memory_special_impl(base, bytes);
3797  }
3798  return res;
3799}
3800
3801bool os::Linux::release_memory_special_impl(char* base, size_t bytes) {
3802  assert(UseLargePages, "only for large pages");
3803  bool res;
3804
3805  if (UseSHM) {
3806    res = os::Linux::release_memory_special_shm(base, bytes);
3807  } else {
3808    assert(UseHugeTLBFS, "must be");
3809    res = os::Linux::release_memory_special_huge_tlbfs(base, bytes);
3810  }
3811  return res;
3812}
3813
3814size_t os::large_page_size() {
3815  return _large_page_size;
3816}
3817
3818// With SysV SHM the entire memory region must be allocated as shared
3819// memory.
3820// HugeTLBFS allows application to commit large page memory on demand.
3821// However, when committing memory with HugeTLBFS fails, the region
3822// that was supposed to be committed will lose the old reservation
3823// and allow other threads to steal that memory region. Because of this
3824// behavior we can't commit HugeTLBFS memory.
3825bool os::can_commit_large_page_memory() {
3826  return UseTransparentHugePages;
3827}
3828
3829bool os::can_execute_large_page_memory() {
3830  return UseTransparentHugePages || UseHugeTLBFS;
3831}
3832
3833// Reserve memory at an arbitrary address, only if that area is
3834// available (and not reserved for something else).
3835
3836char* os::pd_attempt_reserve_memory_at(size_t bytes, char* requested_addr) {
3837  const int max_tries = 10;
3838  char* base[max_tries];
3839  size_t size[max_tries];
3840  const size_t gap = 0x000000;
3841
3842  // Assert only that the size is a multiple of the page size, since
3843  // that's all that mmap requires, and since that's all we really know
3844  // about at this low abstraction level.  If we need higher alignment,
3845  // we can either pass an alignment to this method or verify alignment
3846  // in one of the methods further up the call chain.  See bug 5044738.
3847  assert(bytes % os::vm_page_size() == 0, "reserving unexpected size block");
3848
3849  // Repeatedly allocate blocks until the block is allocated at the
3850  // right spot.
3851
3852  // Linux mmap allows caller to pass an address as hint; give it a try first,
3853  // if kernel honors the hint then we can return immediately.
3854  char * addr = anon_mmap(requested_addr, bytes, false);
3855  if (addr == requested_addr) {
3856    return requested_addr;
3857  }
3858
3859  if (addr != NULL) {
3860    // mmap() is successful but it fails to reserve at the requested address
3861    anon_munmap(addr, bytes);
3862  }
3863
3864  int i;
3865  for (i = 0; i < max_tries; ++i) {
3866    base[i] = reserve_memory(bytes);
3867
3868    if (base[i] != NULL) {
3869      // Is this the block we wanted?
3870      if (base[i] == requested_addr) {
3871        size[i] = bytes;
3872        break;
3873      }
3874
3875      // Does this overlap the block we wanted? Give back the overlapped
3876      // parts and try again.
3877
3878      ptrdiff_t top_overlap = requested_addr + (bytes + gap) - base[i];
3879      if (top_overlap >= 0 && (size_t)top_overlap < bytes) {
3880        unmap_memory(base[i], top_overlap);
3881        base[i] += top_overlap;
3882        size[i] = bytes - top_overlap;
3883      } else {
3884        ptrdiff_t bottom_overlap = base[i] + bytes - requested_addr;
3885        if (bottom_overlap >= 0 && (size_t)bottom_overlap < bytes) {
3886          unmap_memory(requested_addr, bottom_overlap);
3887          size[i] = bytes - bottom_overlap;
3888        } else {
3889          size[i] = bytes;
3890        }
3891      }
3892    }
3893  }
3894
3895  // Give back the unused reserved pieces.
3896
3897  for (int j = 0; j < i; ++j) {
3898    if (base[j] != NULL) {
3899      unmap_memory(base[j], size[j]);
3900    }
3901  }
3902
3903  if (i < max_tries) {
3904    return requested_addr;
3905  } else {
3906    return NULL;
3907  }
3908}
3909
3910size_t os::read(int fd, void *buf, unsigned int nBytes) {
3911  return ::read(fd, buf, nBytes);
3912}
3913
3914size_t os::read_at(int fd, void *buf, unsigned int nBytes, jlong offset) {
3915  return ::pread(fd, buf, nBytes, offset);
3916}
3917
3918// Short sleep, direct OS call.
3919//
3920// Note: certain versions of Linux CFS scheduler (since 2.6.23) do not guarantee
3921// sched_yield(2) will actually give up the CPU:
3922//
3923//   * Alone on this pariticular CPU, keeps running.
3924//   * Before the introduction of "skip_buddy" with "compat_yield" disabled
3925//     (pre 2.6.39).
3926//
3927// So calling this with 0 is an alternative.
3928//
3929void os::naked_short_sleep(jlong ms) {
3930  struct timespec req;
3931
3932  assert(ms < 1000, "Un-interruptable sleep, short time use only");
3933  req.tv_sec = 0;
3934  if (ms > 0) {
3935    req.tv_nsec = (ms % 1000) * 1000000;
3936  } else {
3937    req.tv_nsec = 1;
3938  }
3939
3940  nanosleep(&req, NULL);
3941
3942  return;
3943}
3944
3945// Sleep forever; naked call to OS-specific sleep; use with CAUTION
3946void os::infinite_sleep() {
3947  while (true) {    // sleep forever ...
3948    ::sleep(100);   // ... 100 seconds at a time
3949  }
3950}
3951
3952// Used to convert frequent JVM_Yield() to nops
3953bool os::dont_yield() {
3954  return DontYieldALot;
3955}
3956
3957void os::naked_yield() {
3958  sched_yield();
3959}
3960
3961////////////////////////////////////////////////////////////////////////////////
3962// thread priority support
3963
3964// Note: Normal Linux applications are run with SCHED_OTHER policy. SCHED_OTHER
3965// only supports dynamic priority, static priority must be zero. For real-time
3966// applications, Linux supports SCHED_RR which allows static priority (1-99).
3967// However, for large multi-threaded applications, SCHED_RR is not only slower
3968// than SCHED_OTHER, but also very unstable (my volano tests hang hard 4 out
3969// of 5 runs - Sep 2005).
3970//
3971// The following code actually changes the niceness of kernel-thread/LWP. It
3972// has an assumption that setpriority() only modifies one kernel-thread/LWP,
3973// not the entire user process, and user level threads are 1:1 mapped to kernel
3974// threads. It has always been the case, but could change in the future. For
3975// this reason, the code should not be used as default (ThreadPriorityPolicy=0).
3976// It is only used when ThreadPriorityPolicy=1 and requires root privilege.
3977
3978int os::java_to_os_priority[CriticalPriority + 1] = {
3979  19,              // 0 Entry should never be used
3980
3981   4,              // 1 MinPriority
3982   3,              // 2
3983   2,              // 3
3984
3985   1,              // 4
3986   0,              // 5 NormPriority
3987  -1,              // 6
3988
3989  -2,              // 7
3990  -3,              // 8
3991  -4,              // 9 NearMaxPriority
3992
3993  -5,              // 10 MaxPriority
3994
3995  -5               // 11 CriticalPriority
3996};
3997
3998static int prio_init() {
3999  if (ThreadPriorityPolicy == 1) {
4000    // Only root can raise thread priority. Don't allow ThreadPriorityPolicy=1
4001    // if effective uid is not root. Perhaps, a more elegant way of doing
4002    // this is to test CAP_SYS_NICE capability, but that will require libcap.so
4003    if (geteuid() != 0) {
4004      if (!FLAG_IS_DEFAULT(ThreadPriorityPolicy)) {
4005        warning("-XX:ThreadPriorityPolicy requires root privilege on Linux");
4006      }
4007      ThreadPriorityPolicy = 0;
4008    }
4009  }
4010  if (UseCriticalJavaThreadPriority) {
4011    os::java_to_os_priority[MaxPriority] = os::java_to_os_priority[CriticalPriority];
4012  }
4013  return 0;
4014}
4015
4016OSReturn os::set_native_priority(Thread* thread, int newpri) {
4017  if (!UseThreadPriorities || ThreadPriorityPolicy == 0) return OS_OK;
4018
4019  int ret = setpriority(PRIO_PROCESS, thread->osthread()->thread_id(), newpri);
4020  return (ret == 0) ? OS_OK : OS_ERR;
4021}
4022
4023OSReturn os::get_native_priority(const Thread* const thread,
4024                                 int *priority_ptr) {
4025  if (!UseThreadPriorities || ThreadPriorityPolicy == 0) {
4026    *priority_ptr = java_to_os_priority[NormPriority];
4027    return OS_OK;
4028  }
4029
4030  errno = 0;
4031  *priority_ptr = getpriority(PRIO_PROCESS, thread->osthread()->thread_id());
4032  return (*priority_ptr != -1 || errno == 0 ? OS_OK : OS_ERR);
4033}
4034
4035// Hint to the underlying OS that a task switch would not be good.
4036// Void return because it's a hint and can fail.
4037void os::hint_no_preempt() {}
4038
4039////////////////////////////////////////////////////////////////////////////////
4040// suspend/resume support
4041
4042//  the low-level signal-based suspend/resume support is a remnant from the
4043//  old VM-suspension that used to be for java-suspension, safepoints etc,
4044//  within hotspot. Now there is a single use-case for this:
4045//    - calling get_thread_pc() on the VMThread by the flat-profiler task
4046//      that runs in the watcher thread.
4047//  The remaining code is greatly simplified from the more general suspension
4048//  code that used to be used.
4049//
4050//  The protocol is quite simple:
4051//  - suspend:
4052//      - sends a signal to the target thread
4053//      - polls the suspend state of the osthread using a yield loop
4054//      - target thread signal handler (SR_handler) sets suspend state
4055//        and blocks in sigsuspend until continued
4056//  - resume:
4057//      - sets target osthread state to continue
4058//      - sends signal to end the sigsuspend loop in the SR_handler
4059//
4060//  Note that the SR_lock plays no role in this suspend/resume protocol,
4061//  but is checked for NULL in SR_handler as a thread termination indicator.
4062
4063static void resume_clear_context(OSThread *osthread) {
4064  osthread->set_ucontext(NULL);
4065  osthread->set_siginfo(NULL);
4066}
4067
4068static void suspend_save_context(OSThread *osthread, siginfo_t* siginfo,
4069                                 ucontext_t* context) {
4070  osthread->set_ucontext(context);
4071  osthread->set_siginfo(siginfo);
4072}
4073
4074// Handler function invoked when a thread's execution is suspended or
4075// resumed. We have to be careful that only async-safe functions are
4076// called here (Note: most pthread functions are not async safe and
4077// should be avoided.)
4078//
4079// Note: sigwait() is a more natural fit than sigsuspend() from an
4080// interface point of view, but sigwait() prevents the signal hander
4081// from being run. libpthread would get very confused by not having
4082// its signal handlers run and prevents sigwait()'s use with the
4083// mutex granting granting signal.
4084//
4085// Currently only ever called on the VMThread and JavaThreads (PC sampling)
4086//
4087static void SR_handler(int sig, siginfo_t* siginfo, ucontext_t* context) {
4088  // Save and restore errno to avoid confusing native code with EINTR
4089  // after sigsuspend.
4090  int old_errno = errno;
4091
4092  Thread* thread = Thread::current_or_null_safe();
4093  assert(thread != NULL, "Missing current thread in SR_handler");
4094
4095  // On some systems we have seen signal delivery get "stuck" until the signal
4096  // mask is changed as part of thread termination. Check that the current thread
4097  // has not already terminated (via SR_lock()) - else the following assertion
4098  // will fail because the thread is no longer a JavaThread as the ~JavaThread
4099  // destructor has completed.
4100
4101  if (thread->SR_lock() == NULL) {
4102    return;
4103  }
4104
4105  assert(thread->is_VM_thread() || thread->is_Java_thread(), "Must be VMThread or JavaThread");
4106
4107  OSThread* osthread = thread->osthread();
4108
4109  os::SuspendResume::State current = osthread->sr.state();
4110  if (current == os::SuspendResume::SR_SUSPEND_REQUEST) {
4111    suspend_save_context(osthread, siginfo, context);
4112
4113    // attempt to switch the state, we assume we had a SUSPEND_REQUEST
4114    os::SuspendResume::State state = osthread->sr.suspended();
4115    if (state == os::SuspendResume::SR_SUSPENDED) {
4116      sigset_t suspend_set;  // signals for sigsuspend()
4117      sigemptyset(&suspend_set);
4118      // get current set of blocked signals and unblock resume signal
4119      pthread_sigmask(SIG_BLOCK, NULL, &suspend_set);
4120      sigdelset(&suspend_set, SR_signum);
4121
4122      sr_semaphore.signal();
4123      // wait here until we are resumed
4124      while (1) {
4125        sigsuspend(&suspend_set);
4126
4127        os::SuspendResume::State result = osthread->sr.running();
4128        if (result == os::SuspendResume::SR_RUNNING) {
4129          sr_semaphore.signal();
4130          break;
4131        }
4132      }
4133
4134    } else if (state == os::SuspendResume::SR_RUNNING) {
4135      // request was cancelled, continue
4136    } else {
4137      ShouldNotReachHere();
4138    }
4139
4140    resume_clear_context(osthread);
4141  } else if (current == os::SuspendResume::SR_RUNNING) {
4142    // request was cancelled, continue
4143  } else if (current == os::SuspendResume::SR_WAKEUP_REQUEST) {
4144    // ignore
4145  } else {
4146    // ignore
4147  }
4148
4149  errno = old_errno;
4150}
4151
4152static int SR_initialize() {
4153  struct sigaction act;
4154  char *s;
4155
4156  // Get signal number to use for suspend/resume
4157  if ((s = ::getenv("_JAVA_SR_SIGNUM")) != 0) {
4158    int sig = ::strtol(s, 0, 10);
4159    if (sig > MAX2(SIGSEGV, SIGBUS) &&  // See 4355769.
4160        sig < NSIG) {                   // Must be legal signal and fit into sigflags[].
4161      SR_signum = sig;
4162    } else {
4163      warning("You set _JAVA_SR_SIGNUM=%d. It must be in range [%d, %d]. Using %d instead.",
4164              sig, MAX2(SIGSEGV, SIGBUS)+1, NSIG-1, SR_signum);
4165    }
4166  }
4167
4168  assert(SR_signum > SIGSEGV && SR_signum > SIGBUS,
4169         "SR_signum must be greater than max(SIGSEGV, SIGBUS), see 4355769");
4170
4171  sigemptyset(&SR_sigset);
4172  sigaddset(&SR_sigset, SR_signum);
4173
4174  // Set up signal handler for suspend/resume
4175  act.sa_flags = SA_RESTART|SA_SIGINFO;
4176  act.sa_handler = (void (*)(int)) SR_handler;
4177
4178  // SR_signum is blocked by default.
4179  // 4528190 - We also need to block pthread restart signal (32 on all
4180  // supported Linux platforms). Note that LinuxThreads need to block
4181  // this signal for all threads to work properly. So we don't have
4182  // to use hard-coded signal number when setting up the mask.
4183  pthread_sigmask(SIG_BLOCK, NULL, &act.sa_mask);
4184
4185  if (sigaction(SR_signum, &act, 0) == -1) {
4186    return -1;
4187  }
4188
4189  // Save signal flag
4190  os::Linux::set_our_sigflags(SR_signum, act.sa_flags);
4191  return 0;
4192}
4193
4194static int sr_notify(OSThread* osthread) {
4195  int status = pthread_kill(osthread->pthread_id(), SR_signum);
4196  assert_status(status == 0, status, "pthread_kill");
4197  return status;
4198}
4199
// "Randomly" selected value for how long we want to spin
// before bailing out on suspending a thread, also how often
// we send a signal to a thread we want to resume
// NOTE(review): do_suspend()/do_resume() in this file wait on sr_semaphore
// with a fixed timeout and do not reference either constant - confirm
// whether these are still used anywhere before relying on them.
static const int RANDOMLY_LARGE_INTEGER = 1000000;
static const int RANDOMLY_LARGE_INTEGER2 = 100;
4205
4206// returns true on success and false on error - really an error is fatal
4207// but this seems the normal response to library errors
4208static bool do_suspend(OSThread* osthread) {
4209  assert(osthread->sr.is_running(), "thread should be running");
4210  assert(!sr_semaphore.trywait(), "semaphore has invalid state");
4211
4212  // mark as suspended and send signal
4213  if (osthread->sr.request_suspend() != os::SuspendResume::SR_SUSPEND_REQUEST) {
4214    // failed to switch, state wasn't running?
4215    ShouldNotReachHere();
4216    return false;
4217  }
4218
4219  if (sr_notify(osthread) != 0) {
4220    ShouldNotReachHere();
4221  }
4222
4223  // managed to send the signal and switch to SUSPEND_REQUEST, now wait for SUSPENDED
4224  while (true) {
4225    if (sr_semaphore.timedwait(0, 2 * NANOSECS_PER_MILLISEC)) {
4226      break;
4227    } else {
4228      // timeout
4229      os::SuspendResume::State cancelled = osthread->sr.cancel_suspend();
4230      if (cancelled == os::SuspendResume::SR_RUNNING) {
4231        return false;
4232      } else if (cancelled == os::SuspendResume::SR_SUSPENDED) {
4233        // make sure that we consume the signal on the semaphore as well
4234        sr_semaphore.wait();
4235        break;
4236      } else {
4237        ShouldNotReachHere();
4238        return false;
4239      }
4240    }
4241  }
4242
4243  guarantee(osthread->sr.is_suspended(), "Must be suspended");
4244  return true;
4245}
4246
4247static void do_resume(OSThread* osthread) {
4248  assert(osthread->sr.is_suspended(), "thread should be suspended");
4249  assert(!sr_semaphore.trywait(), "invalid semaphore state");
4250
4251  if (osthread->sr.request_wakeup() != os::SuspendResume::SR_WAKEUP_REQUEST) {
4252    // failed to switch to WAKEUP_REQUEST
4253    ShouldNotReachHere();
4254    return;
4255  }
4256
4257  while (true) {
4258    if (sr_notify(osthread) == 0) {
4259      if (sr_semaphore.timedwait(0, 2 * NANOSECS_PER_MILLISEC)) {
4260        if (osthread->sr.is_running()) {
4261          return;
4262        }
4263      }
4264    } else {
4265      ShouldNotReachHere();
4266    }
4267  }
4268
4269  guarantee(osthread->sr.is_running(), "Must be running!");
4270}
4271
4272///////////////////////////////////////////////////////////////////////////////////
4273// signal handling (except suspend/resume)
4274
// This routine may be used by user applications as a "hook" to catch signals.
// The user-defined signal handler must pass unrecognized signals to this
// routine, and if it returns true (non-zero), then the signal handler must
// return immediately.  If the flag "abort_if_unrecognized" is true, then this
// routine will never return false (zero), but instead will execute a VM panic
// routine and kill the process.
//
// If this routine returns false, it is OK to call it again.  This allows
// the user-defined signal handler to perform checks either before or after
// the VM performs its own checks.  Naturally, the user code would be making
// a serious error if it tried to handle an exception (such as a null check
// or breakpoint) that the VM was generating for its own correct operation.
//
// This routine may recognize any of the following kinds of signals:
//    SIGBUS, SIGSEGV, SIGILL, SIGFPE, SIGQUIT, SIGPIPE, SIGXFSZ, SIGUSR1.
// It should be consulted by handlers for any of those signals.
//
// The caller of this routine must pass in the three arguments supplied
// to the function referred to in the "sa_sigaction" (not the "sa_handler")
// field of the structure passed to sigaction().  This routine assumes that
// the sa_flags field passed to sigaction() includes SA_SIGINFO and SA_RESTART.
//
// Note that the VM will print warnings if it detects conflicting signal
// handlers, unless invoked with the option "-XX:+AllowUserSignalHandlers".
//
extern "C" JNIEXPORT int JVM_handle_linux_signal(int signo,
                                                 siginfo_t* siginfo,
                                                 void* ucontext,
                                                 int abort_if_unrecognized);
4304
4305void signalHandler(int sig, siginfo_t* info, void* uc) {
4306  assert(info != NULL && uc != NULL, "it must be old kernel");
4307  int orig_errno = errno;  // Preserve errno value over signal handler.
4308  JVM_handle_linux_signal(sig, info, uc, true);
4309  errno = orig_errno;
4310}
4311
4312
// This boolean allows users to forward their own non-matching signals
// to JVM_handle_linux_signal, harmlessly.
bool os::Linux::signal_handlers_are_installed = false;

// For signal-chaining
// sigact[sig] holds the handler that was installed before the VM's own one;
// bit (sig-1) of 'sigs' is set once sigact[sig] has been filled in (see
// save_preinstalled_handler() and get_preinstalled_handler()).
struct sigaction sigact[NSIG];
uint64_t sigs = 0;
// 'sigs' only has 64 bits, so the highest signal number (NSIG-1) must not
// exceed 64.
#if (64 < NSIG-1)
#error "Not all signals can be encoded in sigs. Adapt its type!"
#endif
// Set by install_signal_handlers() when the JVM_begin_signal_setting symbol
// is found via dlsym().
bool os::Linux::libjsig_is_loaded = false;
// Type of the JVM_get_signal_action entry point looked up via dlsym().
typedef struct sigaction *(*get_signal_t)(int);
get_signal_t os::Linux::get_signal_action = NULL;
4326
4327struct sigaction* os::Linux::get_chained_signal_action(int sig) {
4328  struct sigaction *actp = NULL;
4329
4330  if (libjsig_is_loaded) {
4331    // Retrieve the old signal handler from libjsig
4332    actp = (*get_signal_action)(sig);
4333  }
4334  if (actp == NULL) {
4335    // Retrieve the preinstalled signal handler from jvm
4336    actp = get_preinstalled_handler(sig);
4337  }
4338
4339  return actp;
4340}
4341
4342static bool call_chained_handler(struct sigaction *actp, int sig,
4343                                 siginfo_t *siginfo, void *context) {
4344  // Call the old signal handler
4345  if (actp->sa_handler == SIG_DFL) {
4346    // It's more reasonable to let jvm treat it as an unexpected exception
4347    // instead of taking the default action.
4348    return false;
4349  } else if (actp->sa_handler != SIG_IGN) {
4350    if ((actp->sa_flags & SA_NODEFER) == 0) {
4351      // automaticlly block the signal
4352      sigaddset(&(actp->sa_mask), sig);
4353    }
4354
4355    sa_handler_t hand = NULL;
4356    sa_sigaction_t sa = NULL;
4357    bool siginfo_flag_set = (actp->sa_flags & SA_SIGINFO) != 0;
4358    // retrieve the chained handler
4359    if (siginfo_flag_set) {
4360      sa = actp->sa_sigaction;
4361    } else {
4362      hand = actp->sa_handler;
4363    }
4364
4365    if ((actp->sa_flags & SA_RESETHAND) != 0) {
4366      actp->sa_handler = SIG_DFL;
4367    }
4368
4369    // try to honor the signal mask
4370    sigset_t oset;
4371    sigemptyset(&oset);
4372    pthread_sigmask(SIG_SETMASK, &(actp->sa_mask), &oset);
4373
4374    // call into the chained handler
4375    if (siginfo_flag_set) {
4376      (*sa)(sig, siginfo, context);
4377    } else {
4378      (*hand)(sig);
4379    }
4380
4381    // restore the signal mask
4382    pthread_sigmask(SIG_SETMASK, &oset, NULL);
4383  }
4384  // Tell jvm's signal handler the signal is taken care of.
4385  return true;
4386}
4387
4388bool os::Linux::chained_handler(int sig, siginfo_t* siginfo, void* context) {
4389  bool chained = false;
4390  // signal-chaining
4391  if (UseSignalChaining) {
4392    struct sigaction *actp = get_chained_signal_action(sig);
4393    if (actp != NULL) {
4394      chained = call_chained_handler(actp, sig, siginfo, context);
4395    }
4396  }
4397  return chained;
4398}
4399
4400struct sigaction* os::Linux::get_preinstalled_handler(int sig) {
4401  if ((((uint64_t)1 << (sig-1)) & sigs) != 0) {
4402    return &sigact[sig];
4403  }
4404  return NULL;
4405}
4406
4407void os::Linux::save_preinstalled_handler(int sig, struct sigaction& oldAct) {
4408  assert(sig > 0 && sig < NSIG, "vm signal out of expected range");
4409  sigact[sig] = oldAct;
4410  sigs |= (uint64_t)1 << (sig-1);
4411}
4412
// for diagnostic
// sigflags[sig] remembers the sa_flags the VM used when installing its own
// handler for 'sig' (written by set_our_sigflags() and set_signal_handler()).
int sigflags[NSIG];
4415
4416int os::Linux::get_our_sigflags(int sig) {
4417  assert(sig > 0 && sig < NSIG, "vm signal out of expected range");
4418  return sigflags[sig];
4419}
4420
4421void os::Linux::set_our_sigflags(int sig, int flags) {
4422  assert(sig > 0 && sig < NSIG, "vm signal out of expected range");
4423  if (sig > 0 && sig < NSIG) {
4424    sigflags[sig] = flags;
4425  }
4426}
4427
4428void os::Linux::set_signal_handler(int sig, bool set_installed) {
4429  // Check for overwrite.
4430  struct sigaction oldAct;
4431  sigaction(sig, (struct sigaction*)NULL, &oldAct);
4432
4433  void* oldhand = oldAct.sa_sigaction
4434                ? CAST_FROM_FN_PTR(void*,  oldAct.sa_sigaction)
4435                : CAST_FROM_FN_PTR(void*,  oldAct.sa_handler);
4436  if (oldhand != CAST_FROM_FN_PTR(void*, SIG_DFL) &&
4437      oldhand != CAST_FROM_FN_PTR(void*, SIG_IGN) &&
4438      oldhand != CAST_FROM_FN_PTR(void*, (sa_sigaction_t)signalHandler)) {
4439    if (AllowUserSignalHandlers || !set_installed) {
4440      // Do not overwrite; user takes responsibility to forward to us.
4441      return;
4442    } else if (UseSignalChaining) {
4443      // save the old handler in jvm
4444      save_preinstalled_handler(sig, oldAct);
4445      // libjsig also interposes the sigaction() call below and saves the
4446      // old sigaction on it own.
4447    } else {
4448      fatal("Encountered unexpected pre-existing sigaction handler "
4449            "%#lx for signal %d.", (long)oldhand, sig);
4450    }
4451  }
4452
4453  struct sigaction sigAct;
4454  sigfillset(&(sigAct.sa_mask));
4455  sigAct.sa_handler = SIG_DFL;
4456  if (!set_installed) {
4457    sigAct.sa_flags = SA_SIGINFO|SA_RESTART;
4458  } else {
4459    sigAct.sa_sigaction = signalHandler;
4460    sigAct.sa_flags = SA_SIGINFO|SA_RESTART;
4461  }
4462  // Save flags, which are set by ours
4463  assert(sig > 0 && sig < NSIG, "vm signal out of expected range");
4464  sigflags[sig] = sigAct.sa_flags;
4465
4466  int ret = sigaction(sig, &sigAct, &oldAct);
4467  assert(ret == 0, "check");
4468
4469  void* oldhand2  = oldAct.sa_sigaction
4470                  ? CAST_FROM_FN_PTR(void*, oldAct.sa_sigaction)
4471                  : CAST_FROM_FN_PTR(void*, oldAct.sa_handler);
4472  assert(oldhand2 == oldhand, "no concurrent signal handler installation");
4473}
4474
4475// install signal handlers for signals that HotSpot needs to
4476// handle in order to support Java-level exception handling.
4477
4478void os::Linux::install_signal_handlers() {
4479  if (!signal_handlers_are_installed) {
4480    signal_handlers_are_installed = true;
4481
4482    // signal-chaining
4483    typedef void (*signal_setting_t)();
4484    signal_setting_t begin_signal_setting = NULL;
4485    signal_setting_t end_signal_setting = NULL;
4486    begin_signal_setting = CAST_TO_FN_PTR(signal_setting_t,
4487                                          dlsym(RTLD_DEFAULT, "JVM_begin_signal_setting"));
4488    if (begin_signal_setting != NULL) {
4489      end_signal_setting = CAST_TO_FN_PTR(signal_setting_t,
4490                                          dlsym(RTLD_DEFAULT, "JVM_end_signal_setting"));
4491      get_signal_action = CAST_TO_FN_PTR(get_signal_t,
4492                                         dlsym(RTLD_DEFAULT, "JVM_get_signal_action"));
4493      libjsig_is_loaded = true;
4494      assert(UseSignalChaining, "should enable signal-chaining");
4495    }
4496    if (libjsig_is_loaded) {
4497      // Tell libjsig jvm is setting signal handlers
4498      (*begin_signal_setting)();
4499    }
4500
4501    set_signal_handler(SIGSEGV, true);
4502    set_signal_handler(SIGPIPE, true);
4503    set_signal_handler(SIGBUS, true);
4504    set_signal_handler(SIGILL, true);
4505    set_signal_handler(SIGFPE, true);
4506#if defined(PPC64)
4507    set_signal_handler(SIGTRAP, true);
4508#endif
4509    set_signal_handler(SIGXFSZ, true);
4510
4511    if (libjsig_is_loaded) {
4512      // Tell libjsig jvm finishes setting signal handlers
4513      (*end_signal_setting)();
4514    }
4515
4516    // We don't activate signal checker if libjsig is in place, we trust ourselves
4517    // and if UserSignalHandler is installed all bets are off.
4518    // Log that signal checking is off only if -verbose:jni is specified.
4519    if (CheckJNICalls) {
4520      if (libjsig_is_loaded) {
4521        if (PrintJNIResolving) {
4522          tty->print_cr("Info: libjsig is activated, all active signal checking is disabled");
4523        }
4524        check_signals = false;
4525      }
4526      if (AllowUserSignalHandlers) {
4527        if (PrintJNIResolving) {
4528          tty->print_cr("Info: AllowUserSignalHandlers is activated, all active signal checking is disabled");
4529        }
4530        check_signals = false;
4531      }
4532    }
4533  }
4534}
4535
4536// This is the fastest way to get thread cpu time on Linux.
4537// Returns cpu time (user+sys) for any thread, not only for current.
4538// POSIX compliant clocks are implemented in the kernels 2.6.16+.
4539// It might work on 2.6.10+ with a special kernel/glibc patch.
4540// For reference, please, see IEEE Std 1003.1-2004:
4541//   http://www.unix.org/single_unix_specification
4542
4543jlong os::Linux::fast_thread_cpu_time(clockid_t clockid) {
4544  struct timespec tp;
4545  int rc = os::Linux::clock_gettime(clockid, &tp);
4546  assert(rc == 0, "clock_gettime is expected to return 0 code");
4547
4548  return (tp.tv_sec * NANOSECS_PER_SEC) + tp.tv_nsec;
4549}
4550
4551void os::Linux::initialize_os_info() {
4552  assert(_os_version == 0, "OS info already initialized");
4553
4554  struct utsname _uname;
4555
4556  uint32_t major;
4557  uint32_t minor;
4558  uint32_t fix;
4559
4560  int rc;
4561
4562  // Kernel version is unknown if
4563  // verification below fails.
4564  _os_version = 0x01000000;
4565
4566  rc = uname(&_uname);
4567  if (rc != -1) {
4568
4569    rc = sscanf(_uname.release,"%d.%d.%d", &major, &minor, &fix);
4570    if (rc == 3) {
4571
4572      if (major < 256 && minor < 256 && fix < 256) {
4573        // Kernel version format is as expected,
4574        // set it overriding unknown state.
4575        _os_version = (major << 16) |
4576                      (minor << 8 ) |
4577                      (fix   << 0 ) ;
4578      }
4579    }
4580  }
4581}
4582
4583uint32_t os::Linux::os_version() {
4584  assert(_os_version != 0, "not initialized");
4585  return _os_version & 0x00FFFFFF;
4586}
4587
4588bool os::Linux::os_version_is_known() {
4589  assert(_os_version != 0, "not initialized");
4590  return _os_version & 0x01000000 ? false : true;
4591}
4592
/////
// glibc on the Linux platform uses an undocumented sa_flags bit
// to indicate that a special kind of signal trampoline is in use.
// We never set this flag ourselves, so it is masked out (bit 0x04000000)
// before sa_flags values are compared or printed in our diagnostics.
#ifdef SIGNIFICANT_SIGNAL_MASK
  #undef SIGNIFICANT_SIGNAL_MASK
#endif
#define SIGNIFICANT_SIGNAL_MASK (~0x04000000)
4603
4604static const char* get_signal_handler_name(address handler,
4605                                           char* buf, int buflen) {
4606  int offset = 0;
4607  bool found = os::dll_address_to_library_name(handler, buf, buflen, &offset);
4608  if (found) {
4609    // skip directory names
4610    const char *p1, *p2;
4611    p1 = buf;
4612    size_t len = strlen(os::file_separator());
4613    while ((p2 = strstr(p1, os::file_separator())) != NULL) p1 = p2 + len;
4614    jio_snprintf(buf, buflen, "%s+0x%x", p1, offset);
4615  } else {
4616    jio_snprintf(buf, buflen, PTR_FORMAT, handler);
4617  }
4618  return buf;
4619}
4620
4621static void print_signal_handler(outputStream* st, int sig,
4622                                 char* buf, size_t buflen) {
4623  struct sigaction sa;
4624
4625  sigaction(sig, NULL, &sa);
4626
4627  // See comment for SIGNIFICANT_SIGNAL_MASK define
4628  sa.sa_flags &= SIGNIFICANT_SIGNAL_MASK;
4629
4630  st->print("%s: ", os::exception_name(sig, buf, buflen));
4631
4632  address handler = (sa.sa_flags & SA_SIGINFO)
4633    ? CAST_FROM_FN_PTR(address, sa.sa_sigaction)
4634    : CAST_FROM_FN_PTR(address, sa.sa_handler);
4635
4636  if (handler == CAST_FROM_FN_PTR(address, SIG_DFL)) {
4637    st->print("SIG_DFL");
4638  } else if (handler == CAST_FROM_FN_PTR(address, SIG_IGN)) {
4639    st->print("SIG_IGN");
4640  } else {
4641    st->print("[%s]", get_signal_handler_name(handler, buf, buflen));
4642  }
4643
4644  st->print(", sa_mask[0]=");
4645  os::Posix::print_signal_set_short(st, &sa.sa_mask);
4646
4647  address rh = VMError::get_resetted_sighandler(sig);
4648  // May be, handler was resetted by VMError?
4649  if (rh != NULL) {
4650    handler = rh;
4651    sa.sa_flags = VMError::get_resetted_sigflags(sig) & SIGNIFICANT_SIGNAL_MASK;
4652  }
4653
4654  st->print(", sa_flags=");
4655  os::Posix::print_sa_flags(st, sa.sa_flags);
4656
4657  // Check: is it our handler?
4658  if (handler == CAST_FROM_FN_PTR(address, (sa_sigaction_t)signalHandler) ||
4659      handler == CAST_FROM_FN_PTR(address, (sa_sigaction_t)SR_handler)) {
4660    // It is our signal handler
4661    // check for flags, reset system-used one!
4662    if ((int)sa.sa_flags != os::Linux::get_our_sigflags(sig)) {
4663      st->print(
4664                ", flags was changed from " PTR32_FORMAT ", consider using jsig library",
4665                os::Linux::get_our_sigflags(sig));
4666    }
4667  }
4668  st->cr();
4669}
4670
4671
// Runs os::Linux::check_signal_handler(sig) unless 'sig' has already been
// added to check_signal_done (check_signal_handler() adds a signal there once
// it has reported a mismatch for it).
#define DO_SIGNAL_CHECK(sig)                      \
  do {                                            \
    if (!sigismember(&check_signal_done, sig)) {  \
      os::Linux::check_signal_handler(sig);       \
    }                                             \
  } while (0)
4678
4679// This method is a periodic task to check for misbehaving JNI applications
4680// under CheckJNI, we can add any periodic checks here
4681
4682void os::run_periodic_checks() {
4683  if (check_signals == false) return;
4684
4685  // SEGV and BUS if overridden could potentially prevent
4686  // generation of hs*.log in the event of a crash, debugging
4687  // such a case can be very challenging, so we absolutely
4688  // check the following for a good measure:
4689  DO_SIGNAL_CHECK(SIGSEGV);
4690  DO_SIGNAL_CHECK(SIGILL);
4691  DO_SIGNAL_CHECK(SIGFPE);
4692  DO_SIGNAL_CHECK(SIGBUS);
4693  DO_SIGNAL_CHECK(SIGPIPE);
4694  DO_SIGNAL_CHECK(SIGXFSZ);
4695#if defined(PPC64)
4696  DO_SIGNAL_CHECK(SIGTRAP);
4697#endif
4698
4699  // ReduceSignalUsage allows the user to override these handlers
4700  // see comments at the very top and jvm_solaris.h
4701  if (!ReduceSignalUsage) {
4702    DO_SIGNAL_CHECK(SHUTDOWN1_SIGNAL);
4703    DO_SIGNAL_CHECK(SHUTDOWN2_SIGNAL);
4704    DO_SIGNAL_CHECK(SHUTDOWN3_SIGNAL);
4705    DO_SIGNAL_CHECK(BREAK_SIGNAL);
4706  }
4707
4708  DO_SIGNAL_CHECK(SR_signum);
4709}
4710
// Signature of sigaction(2).
typedef int (*os_sigaction_t)(int, const struct sigaction *, struct sigaction *);

// Cached address of "sigaction" as resolved through dlsym(RTLD_DEFAULT, ...);
// filled in lazily by os::Linux::check_signal_handler().
static os_sigaction_t os_sigaction = NULL;
4714
4715void os::Linux::check_signal_handler(int sig) {
4716  char buf[O_BUFLEN];
4717  address jvmHandler = NULL;
4718
4719
4720  struct sigaction act;
4721  if (os_sigaction == NULL) {
4722    // only trust the default sigaction, in case it has been interposed
4723    os_sigaction = (os_sigaction_t)dlsym(RTLD_DEFAULT, "sigaction");
4724    if (os_sigaction == NULL) return;
4725  }
4726
4727  os_sigaction(sig, (struct sigaction*)NULL, &act);
4728
4729
4730  act.sa_flags &= SIGNIFICANT_SIGNAL_MASK;
4731
4732  address thisHandler = (act.sa_flags & SA_SIGINFO)
4733    ? CAST_FROM_FN_PTR(address, act.sa_sigaction)
4734    : CAST_FROM_FN_PTR(address, act.sa_handler);
4735
4736
4737  switch (sig) {
4738  case SIGSEGV:
4739  case SIGBUS:
4740  case SIGFPE:
4741  case SIGPIPE:
4742  case SIGILL:
4743  case SIGXFSZ:
4744    jvmHandler = CAST_FROM_FN_PTR(address, (sa_sigaction_t)signalHandler);
4745    break;
4746
4747  case SHUTDOWN1_SIGNAL:
4748  case SHUTDOWN2_SIGNAL:
4749  case SHUTDOWN3_SIGNAL:
4750  case BREAK_SIGNAL:
4751    jvmHandler = (address)user_handler();
4752    break;
4753
4754  default:
4755    if (sig == SR_signum) {
4756      jvmHandler = CAST_FROM_FN_PTR(address, (sa_sigaction_t)SR_handler);
4757    } else {
4758      return;
4759    }
4760    break;
4761  }
4762
4763  if (thisHandler != jvmHandler) {
4764    tty->print("Warning: %s handler ", exception_name(sig, buf, O_BUFLEN));
4765    tty->print("expected:%s", get_signal_handler_name(jvmHandler, buf, O_BUFLEN));
4766    tty->print_cr("  found:%s", get_signal_handler_name(thisHandler, buf, O_BUFLEN));
4767    // No need to check this sig any longer
4768    sigaddset(&check_signal_done, sig);
4769    // Running under non-interactive shell, SHUTDOWN2_SIGNAL will be reassigned SIG_IGN
4770    if (sig == SHUTDOWN2_SIGNAL && !isatty(fileno(stdin))) {
4771      tty->print_cr("Running in non-interactive shell, %s handler is replaced by shell",
4772                    exception_name(sig, buf, O_BUFLEN));
4773    }
4774  } else if(os::Linux::get_our_sigflags(sig) != 0 && (int)act.sa_flags != os::Linux::get_our_sigflags(sig)) {
4775    tty->print("Warning: %s handler flags ", exception_name(sig, buf, O_BUFLEN));
4776    tty->print("expected:");
4777    os::Posix::print_sa_flags(tty, os::Linux::get_our_sigflags(sig));
4778    tty->cr();
4779    tty->print("  found:");
4780    os::Posix::print_sa_flags(tty, act.sa_flags);
4781    tty->cr();
4782    // No need to check this sig any longer
4783    sigaddset(&check_signal_done, sig);
4784  }
4785
4786  // Dump all the signal
4787  if (sigismember(&check_signal_done, sig)) {
4788    print_signal_handlers(tty, buf, O_BUFLEN);
4789  }
4790}
4791
// Declaration of an error-reporting routine defined elsewhere.
// NOTE(review): nothing in this part of the file calls it - confirm it is
// still needed.
extern void report_error(char* file_name, int line_no, char* title,
                         char* format, ...);
4794
// this is called _before_ the most of global arguments have been parsed
// Performs the early, argument-independent part of OS initialization: clock
// tick rate, random seed, ThreadCritical, page size, system and kernel
// version info, main thread id, clocks and the optional pthread_setname_np
// entry point.
void os::init(void) {
  // NOTE(review): 'dummy' is not referenced anywhere in this function.
  char dummy;   // used to get a guess on initial stack address
//  first_hrtime = gethrtime();

  clock_tics_per_sec = sysconf(_SC_CLK_TCK);

  init_random(1234567);

  ThreadCritical::initialize();

  // The VM cannot run without knowing the page size, so failure is fatal.
  Linux::set_page_size(sysconf(_SC_PAGESIZE));
  if (Linux::page_size() == -1) {
    fatal("os_linux.cpp: os::init: sysconf failed (%s)",
          os::strerror(errno));
  }
  init_page_sizes((size_t) Linux::page_size());

  Linux::initialize_system_info();

  Linux::initialize_os_info();

  // main_thread points to the aboriginal thread
  Linux::_main_thread = pthread_self();

  Linux::clock_init();
  initial_time_count = javaTimeNanos();

  // retrieve entry point for pthread_setname_np
  // (left NULL when the symbol is not available)
  Linux::_pthread_setname_np =
    (int(*)(pthread_t, const char*))dlsym(RTLD_DEFAULT, "pthread_setname_np");

  os::Posix::init();
}
4829
// To install functions for atexit system call
// perfMemory_exit_helper simply forwards to perfMemory_exit(); os::init_2()
// registers it with atexit() when PerfAllowAtExitRegistration is set.
extern "C" {
  static void perfMemory_exit_helper() {
    perfMemory_exit();
  }
}
4836
// this is called _after_ the global arguments have been parsed
// Performs the flag-dependent part of OS initialization: polling and memory
// serialize pages, suspend/resume and signal handlers, stack sizes, libpthread
// and NUMA setup, file descriptor limit, the createThread lock, the optional
// atexit hook and thread priorities. Returns JNI_OK on success and JNI_ERR if
// suspend/resume initialization or the minimum stack size check fails.
jint os::init_2(void) {

  os::Posix::init_2();

  Linux::fast_thread_clock_init();

  // Allocate a single page and mark it as readable for safepoint polling
  address polling_page = (address) ::mmap(NULL, Linux::page_size(), PROT_READ, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
  guarantee(polling_page != MAP_FAILED, "os::init_2: failed to allocate polling page");

  os::set_polling_page(polling_page);
  log_info(os)("SafePoint Polling address: " INTPTR_FORMAT, p2i(polling_page));

  // The memory serialize page is only allocated when UseMembar is off.
  if (!UseMembar) {
    address mem_serialize_page = (address) ::mmap(NULL, Linux::page_size(), PROT_READ | PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    guarantee(mem_serialize_page != MAP_FAILED, "mmap Failed for memory serialize page");
    os::set_memory_serialize_page(mem_serialize_page);
    log_info(os)("Memory Serialize Page address: " INTPTR_FORMAT, p2i(mem_serialize_page));
  }

  // initialize suspend/resume support - must do this before signal_sets_init()
  if (SR_initialize() != 0) {
    perror("SR_initialize failed");
    return JNI_ERR;
  }

  Linux::signal_sets_init();
  Linux::install_signal_handlers();

  // Check and sets minimum stack sizes against command line options
  if (Posix::set_minimum_stack_sizes() == JNI_ERR) {
    return JNI_ERR;
  }
  Linux::capture_initial_stack(JavaThread::stack_size_at_create());

#if defined(IA32)
  workaround_expand_exec_shield_cs_limit();
#endif

  Linux::libpthread_init();
  log_info(os)("HotSpot is running with %s, %s",
               Linux::glibc_version(), Linux::libpthread_version());

  if (UseNUMA) {
    // NUMA is switched off if libnuma cannot be initialized or only a single
    // node exists.
    if (!Linux::libnuma_init()) {
      UseNUMA = false;
    } else {
      if ((Linux::numa_max_node() < 1)) {
        // There's only one node(they start from 0), disable NUMA.
        UseNUMA = false;
      }
    }
    // With SHM and HugeTLBFS large pages we cannot uncommit a page, so there's no way
    // we can make the adaptive lgrp chunk resizing work. If the user specified
    // both UseNUMA and UseLargePages (or UseSHM/UseHugeTLBFS) on the command line - warn and
    // disable adaptive resizing.
    if (UseNUMA && UseLargePages && !can_commit_large_page_memory()) {
      if (FLAG_IS_DEFAULT(UseNUMA)) {
        // UseNUMA was not requested explicitly, so it is the one given up.
        UseNUMA = false;
      } else {
        if (FLAG_IS_DEFAULT(UseLargePages) &&
            FLAG_IS_DEFAULT(UseSHM) &&
            FLAG_IS_DEFAULT(UseHugeTLBFS)) {
          // Large pages were not requested explicitly; drop them instead.
          UseLargePages = false;
        } else if (UseAdaptiveSizePolicy || UseAdaptiveNUMAChunkSizing) {
          warning("UseNUMA is not fully compatible with SHM/HugeTLBFS large pages, disabling adaptive resizing (-XX:-UseAdaptiveSizePolicy -XX:-UseAdaptiveNUMAChunkSizing)");
          UseAdaptiveSizePolicy = false;
          UseAdaptiveNUMAChunkSizing = false;
        }
      }
    }
    // ForceNUMA turns NUMA back on whatever was decided above.
    if (!UseNUMA && ForceNUMA) {
      UseNUMA = true;
    }
  }

  if (MaxFDLimit) {
    // set the number of file descriptors to max. print out error
    // if getrlimit/setrlimit fails but continue regardless.
    struct rlimit nbr_files;
    int status = getrlimit(RLIMIT_NOFILE, &nbr_files);
    if (status != 0) {
      log_info(os)("os::init_2 getrlimit failed: %s", os::strerror(errno));
    } else {
      // Raise the soft limit to the hard limit.
      nbr_files.rlim_cur = nbr_files.rlim_max;
      status = setrlimit(RLIMIT_NOFILE, &nbr_files);
      if (status != 0) {
        log_info(os)("os::init_2 setrlimit failed: %s", os::strerror(errno));
      }
    }
  }

  // Initialize lock used to serialize thread creation (see os::create_thread)
  Linux::set_createThread_lock(new Mutex(Mutex::leaf, "createThread_lock", false));

  // at-exit methods are called in the reverse order of their registration.
  // atexit functions are called on return from main or as a result of a
  // call to exit(3C). There can be only 32 of these functions registered
  // and atexit() does not set errno.

  if (PerfAllowAtExitRegistration) {
    // only register atexit functions if PerfAllowAtExitRegistration is set.
    // atexit functions can be delayed until process exit time, which
    // can be problematic for embedded VM situations. Embedded VMs should
    // call DestroyJavaVM() to assure that VM resources are released.

    // note: perfMemory_exit_helper atexit function may be removed in
    // the future if the appropriate cleanup code can be added to the
    // VM_Exit VMOperation's doit method.
    if (atexit(perfMemory_exit_helper) != 0) {
      warning("os::init_2 atexit(perfMemory_exit_helper) failed");
    }
  }

  // initialize thread priority policy
  prio_init();

  return JNI_OK;
}
4957
4958// Mark the polling page as unreadable
4959void os::make_polling_page_unreadable(void) {
4960  if (!guard_memory((char*)_polling_page, Linux::page_size())) {
4961    fatal("Could not disable polling page");
4962  }
4963}
4964
4965// Mark the polling page as readable
4966void os::make_polling_page_readable(void) {
4967  if (!linux_mprotect((char *)_polling_page, Linux::page_size(), PROT_READ)) {
4968    fatal("Could not enable polling page");
4969  }
4970}
4971
// Older glibc versions lack the CPU_COUNT macro (which expands to an
// optimized bit-counting function), so a simple replacement is provided
// when it is missing.
#ifndef CPU_COUNT

// Counts the processors present in 'cpus', looking only at indices below the
// number of configured processors.
static int _cpu_count(const cpu_set_t* cpus) {
  int members = 0;
  int cpu = 0;
  while (cpu < os::processor_count()) {
    members += CPU_ISSET(cpu, cpus) ? 1 : 0;
    cpu++;
  }
  return members;
}

#define CPU_COUNT(cpus) _cpu_count(cpus)

#endif // CPU_COUNT
4990
// Get the current number of available processors for this process.
// This value can change at any time during a process's lifetime.
// sched_getaffinity gives an accurate answer as it accounts for cpusets.
// If it appears there may be more than 1024 processors then we do a
// dynamic check - see 6515172 for details.
// If anything goes wrong we fallback to returning the number of online
// processors - which can be greater than the number available to the process.
int os::active_processor_count() {
  cpu_set_t cpus;  // can represent at most 1024 (CPU_SETSIZE) processors
  // cpus_p/cpus_size describe the set handed to sched_getaffinity(): the
  // stack-allocated 'cpus' by default, a CPU_ALLOC'ed set on the dynamic path.
  cpu_set_t* cpus_p = &cpus;
  int cpus_size = sizeof(cpu_set_t);

  int configured_cpus = processor_count();  // upper bound on available cpus
  int cpu_count = 0;

// old build platforms may not support dynamic cpu sets
#ifdef CPU_ALLOC

  // To enable easy testing of the dynamic path on different platforms we
  // introduce a diagnostic flag: UseCpuAllocPath
  if (configured_cpus >= CPU_SETSIZE || UseCpuAllocPath) {
    // kernel may use a mask bigger than cpu_set_t
    log_trace(os)("active_processor_count: using dynamic path %s"
                  "- configured processors: %d",
                  UseCpuAllocPath ? "(forced) " : "",
                  configured_cpus);
    cpus_p = CPU_ALLOC(configured_cpus);
    if (cpus_p != NULL) {
      cpus_size = CPU_ALLOC_SIZE(configured_cpus);
      // zero it just to be safe
      CPU_ZERO_S(cpus_size, cpus_p);
    }
    else {
       // failed to allocate so fallback to online cpus
       int online_cpus = ::sysconf(_SC_NPROCESSORS_ONLN);
       log_trace(os)("active_processor_count: "
                     "CPU_ALLOC failed (%s) - using "
                     "online processor count: %d",
                     os::strerror(errno), online_cpus);
       return online_cpus;
    }
  }
  else {
    log_trace(os)("active_processor_count: using static path - configured processors: %d",
                  configured_cpus);
  }
#else // CPU_ALLOC
// these stubs won't be executed
// (they only keep the code below compiling: without CPU_ALLOC, cpus_p always
// stays equal to &cpus, so the branches that use them are never taken)
#define CPU_COUNT_S(size, cpus) -1
#define CPU_FREE(cpus)

  log_trace(os)("active_processor_count: only static path available - configured processors: %d",
                configured_cpus);
#endif // CPU_ALLOC

  // pid 0 means the current thread - which we have to assume represents the process
  if (sched_getaffinity(0, cpus_size, cpus_p) == 0) {
    if (cpus_p != &cpus) { // can only be true when CPU_ALLOC used
      cpu_count = CPU_COUNT_S(cpus_size, cpus_p);
    }
    else {
      cpu_count = CPU_COUNT(cpus_p);
    }
    log_trace(os)("active_processor_count: sched_getaffinity processor count: %d", cpu_count);
  }
  else {
    // Affinity query failed: fall back to the number of online processors.
    cpu_count = ::sysconf(_SC_NPROCESSORS_ONLN);
    warning("sched_getaffinity failed (%s)- using online processor count (%d) "
            "which may exceed available processors", os::strerror(errno), cpu_count);
  }

  // Release the dynamically allocated set, if one was used.
  if (cpus_p != &cpus) { // can only be true when CPU_ALLOC used
    CPU_FREE(cpus_p);
  }

  assert(cpu_count > 0 && cpu_count <= processor_count(), "sanity check");
  return cpu_count;
}
5069
5070void os::set_native_thread_name(const char *name) {
5071  if (Linux::_pthread_setname_np) {
5072    char buf [16]; // according to glibc manpage, 16 chars incl. '/0'
5073    snprintf(buf, sizeof(buf), "%s", name);
5074    buf[sizeof(buf) - 1] = '\0';
5075    const int rc = Linux::_pthread_setname_np(pthread_self(), buf);
5076    // ERANGE should not happen; all other errors should just be ignored.
5077    assert(rc != ERANGE, "pthread_setname_np failed");
5078  }
5079}
5080
5081bool os::distribute_processes(uint length, uint* distribution) {
5082  // Not yet implemented.
5083  return false;
5084}
5085
5086bool os::bind_to_processor(uint processor_id) {
5087  // Not yet implemented.
5088  return false;
5089}
5090
5091///
5092
5093void os::SuspendedThreadTask::internal_do_task() {
5094  if (do_suspend(_thread->osthread())) {
5095    SuspendedThreadTaskContext context(_thread, _thread->osthread()->ucontext());
5096    do_task(context);
5097    do_resume(_thread->osthread());
5098  }
5099}
5100
5101class PcFetcher : public os::SuspendedThreadTask {
5102 public:
5103  PcFetcher(Thread* thread) : os::SuspendedThreadTask(thread) {}
5104  ExtendedPC result();
5105 protected:
5106  void do_task(const os::SuspendedThreadTaskContext& context);
5107 private:
5108  ExtendedPC _epc;
5109};
5110
5111ExtendedPC PcFetcher::result() {
5112  guarantee(is_done(), "task is not done yet.");
5113  return _epc;
5114}
5115
5116void PcFetcher::do_task(const os::SuspendedThreadTaskContext& context) {
5117  Thread* thread = context.thread();
5118  OSThread* osthread = thread->osthread();
5119  if (osthread->ucontext() != NULL) {
5120    _epc = os::Linux::ucontext_get_pc((const ucontext_t *) context.ucontext());
5121  } else {
5122    // NULL context is unexpected, double-check this is the VMThread
5123    guarantee(thread->is_VM_thread(), "can only be called for VMThread");
5124  }
5125}
5126
5127// Suspends the target using the signal mechanism and then grabs the PC before
5128// resuming the target. Used by the flat-profiler only
5129ExtendedPC os::get_thread_pc(Thread* thread) {
5130  // Make sure that it is called by the watcher for the VMThread
5131  assert(Thread::current()->is_Watcher_thread(), "Must be watcher");
5132  assert(thread->is_VM_thread(), "Can only be called for VMThread");
5133
5134  PcFetcher fetcher(thread);
5135  fetcher.run();
5136  return fetcher.result();
5137}
5138
5139////////////////////////////////////////////////////////////////////////////////
5140// debug support
5141
5142bool os::find(address addr, outputStream* st) {
5143  Dl_info dlinfo;
5144  memset(&dlinfo, 0, sizeof(dlinfo));
5145  if (dladdr(addr, &dlinfo) != 0) {
5146    st->print(PTR_FORMAT ": ", p2i(addr));
5147    if (dlinfo.dli_sname != NULL && dlinfo.dli_saddr != NULL) {
5148      st->print("%s+" PTR_FORMAT, dlinfo.dli_sname,
5149                p2i(addr) - p2i(dlinfo.dli_saddr));
5150    } else if (dlinfo.dli_fbase != NULL) {
5151      st->print("<offset " PTR_FORMAT ">", p2i(addr) - p2i(dlinfo.dli_fbase));
5152    } else {
5153      st->print("<absolute address>");
5154    }
5155    if (dlinfo.dli_fname != NULL) {
5156      st->print(" in %s", dlinfo.dli_fname);
5157    }
5158    if (dlinfo.dli_fbase != NULL) {
5159      st->print(" at " PTR_FORMAT, p2i(dlinfo.dli_fbase));
5160    }
5161    st->cr();
5162
5163    if (Verbose) {
5164      // decode some bytes around the PC
5165      address begin = clamp_address_in_page(addr-40, addr, os::vm_page_size());
5166      address end   = clamp_address_in_page(addr+40, addr, os::vm_page_size());
5167      address       lowest = (address) dlinfo.dli_sname;
5168      if (!lowest)  lowest = (address) dlinfo.dli_fbase;
5169      if (begin < lowest)  begin = lowest;
5170      Dl_info dlinfo2;
5171      if (dladdr(end, &dlinfo2) != 0 && dlinfo2.dli_saddr != dlinfo.dli_saddr
5172          && end > dlinfo2.dli_saddr && dlinfo2.dli_saddr > begin) {
5173        end = (address) dlinfo2.dli_saddr;
5174      }
5175      Disassembler::decode(begin, end, st);
5176    }
5177    return true;
5178  }
5179  return false;
5180}
5181
5182////////////////////////////////////////////////////////////////////////////////
5183// misc
5184
5185// This does not do anything on Linux. This is basically a hook for being
5186// able to use structured exception handling (thread-local exception filters)
5187// on, e.g., Win32.
5188void
5189os::os_exception_wrapper(java_call_t f, JavaValue* value, const methodHandle& method,
5190                         JavaCallArguments* args, Thread* thread) {
5191  f(value, method, args, thread);
5192}
5193
5194void os::print_statistics() {
5195}
5196
5197bool os::message_box(const char* title, const char* message) {
5198  int i;
5199  fdStream err(defaultStream::error_fd());
5200  for (i = 0; i < 78; i++) err.print_raw("=");
5201  err.cr();
5202  err.print_raw_cr(title);
5203  for (i = 0; i < 78; i++) err.print_raw("-");
5204  err.cr();
5205  err.print_raw_cr(message);
5206  for (i = 0; i < 78; i++) err.print_raw("=");
5207  err.cr();
5208
5209  char buf[16];
5210  // Prevent process from exiting upon "read error" without consuming all CPU
5211  while (::read(0, buf, sizeof(buf)) <= 0) { ::sleep(100); }
5212
5213  return buf[0] == 'y' || buf[0] == 'Y';
5214}
5215
5216int os::stat(const char *path, struct stat *sbuf) {
5217  char pathbuf[MAX_PATH];
5218  if (strlen(path) > MAX_PATH - 1) {
5219    errno = ENAMETOOLONG;
5220    return -1;
5221  }
5222  os::native_path(strcpy(pathbuf, path));
5223  return ::stat(pathbuf, sbuf);
5224}
5225
5226// Is a (classpath) directory empty?
5227bool os::dir_is_empty(const char* path) {
5228  DIR *dir = NULL;
5229  struct dirent *ptr;
5230
5231  dir = opendir(path);
5232  if (dir == NULL) return true;
5233
5234  // Scan the directory
5235  bool result = true;
5236  char buf[sizeof(struct dirent) + MAX_PATH];
5237  while (result && (ptr = ::readdir(dir)) != NULL) {
5238    if (strcmp(ptr->d_name, ".") != 0 && strcmp(ptr->d_name, "..") != 0) {
5239      result = false;
5240    }
5241  }
5242  closedir(dir);
5243  return result;
5244}
5245
5246// This code originates from JDK's sysOpen and open64_w
5247// from src/solaris/hpi/src/system_md.c
5248
5249int os::open(const char *path, int oflag, int mode) {
5250  if (strlen(path) > MAX_PATH - 1) {
5251    errno = ENAMETOOLONG;
5252    return -1;
5253  }
5254
5255  // All file descriptors that are opened in the Java process and not
5256  // specifically destined for a subprocess should have the close-on-exec
5257  // flag set.  If we don't set it, then careless 3rd party native code
5258  // might fork and exec without closing all appropriate file descriptors
5259  // (e.g. as we do in closeDescriptors in UNIXProcess.c), and this in
5260  // turn might:
5261  //
5262  // - cause end-of-file to fail to be detected on some file
5263  //   descriptors, resulting in mysterious hangs, or
5264  //
5265  // - might cause an fopen in the subprocess to fail on a system
5266  //   suffering from bug 1085341.
5267  //
5268  // (Yes, the default setting of the close-on-exec flag is a Unix
5269  // design flaw)
5270  //
5271  // See:
5272  // 1085341: 32-bit stdio routines should support file descriptors >255
5273  // 4843136: (process) pipe file descriptor from Runtime.exec not being closed
5274  // 6339493: (process) Runtime.exec does not close all file descriptors on Solaris 9
5275  //
5276  // Modern Linux kernels (after 2.6.23 2007) support O_CLOEXEC with open().
5277  // O_CLOEXEC is preferable to using FD_CLOEXEC on an open file descriptor
5278  // because it saves a system call and removes a small window where the flag
5279  // is unset.  On ancient Linux kernels the O_CLOEXEC flag will be ignored
5280  // and we fall back to using FD_CLOEXEC (see below).
5281#ifdef O_CLOEXEC
5282  oflag |= O_CLOEXEC;
5283#endif
5284
5285  int fd = ::open64(path, oflag, mode);
5286  if (fd == -1) return -1;
5287
5288  //If the open succeeded, the file might still be a directory
5289  {
5290    struct stat64 buf64;
5291    int ret = ::fstat64(fd, &buf64);
5292    int st_mode = buf64.st_mode;
5293
5294    if (ret != -1) {
5295      if ((st_mode & S_IFMT) == S_IFDIR) {
5296        errno = EISDIR;
5297        ::close(fd);
5298        return -1;
5299      }
5300    } else {
5301      ::close(fd);
5302      return -1;
5303    }
5304  }
5305
5306#ifdef FD_CLOEXEC
5307  // Validate that the use of the O_CLOEXEC flag on open above worked.
5308  // With recent kernels, we will perform this check exactly once.
5309  static sig_atomic_t O_CLOEXEC_is_known_to_work = 0;
5310  if (!O_CLOEXEC_is_known_to_work) {
5311    int flags = ::fcntl(fd, F_GETFD);
5312    if (flags != -1) {
5313      if ((flags & FD_CLOEXEC) != 0)
5314        O_CLOEXEC_is_known_to_work = 1;
5315      else
5316        ::fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
5317    }
5318  }
5319#endif
5320
5321  return fd;
5322}
5323
5324
5325// create binary file, rewriting existing file if required
5326int os::create_binary_file(const char* path, bool rewrite_existing) {
5327  int oflags = O_WRONLY | O_CREAT;
5328  if (!rewrite_existing) {
5329    oflags |= O_EXCL;
5330  }
5331  return ::open64(path, oflags, S_IREAD | S_IWRITE);
5332}
5333
5334// return current position of file pointer
5335jlong os::current_file_offset(int fd) {
5336  return (jlong)::lseek64(fd, (off64_t)0, SEEK_CUR);
5337}
5338
5339// move file pointer to the specified offset
5340jlong os::seek_to_file_offset(int fd, jlong offset) {
5341  return (jlong)::lseek64(fd, (off64_t)offset, SEEK_SET);
5342}
5343
5344// This code originates from JDK's sysAvailable
5345// from src/solaris/hpi/src/native_threads/src/sys_api_td.c
5346
5347int os::available(int fd, jlong *bytes) {
5348  jlong cur, end;
5349  int mode;
5350  struct stat64 buf64;
5351
5352  if (::fstat64(fd, &buf64) >= 0) {
5353    mode = buf64.st_mode;
5354    if (S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) {
5355      int n;
5356      if (::ioctl(fd, FIONREAD, &n) >= 0) {
5357        *bytes = n;
5358        return 1;
5359      }
5360    }
5361  }
5362  if ((cur = ::lseek64(fd, 0L, SEEK_CUR)) == -1) {
5363    return 0;
5364  } else if ((end = ::lseek64(fd, 0L, SEEK_END)) == -1) {
5365    return 0;
5366  } else if (::lseek64(fd, cur, SEEK_SET) == -1) {
5367    return 0;
5368  }
5369  *bytes = end - cur;
5370  return 1;
5371}
5372
5373// Map a block of memory.
5374char* os::pd_map_memory(int fd, const char* file_name, size_t file_offset,
5375                        char *addr, size_t bytes, bool read_only,
5376                        bool allow_exec) {
5377  int prot;
5378  int flags = MAP_PRIVATE;
5379
5380  if (read_only) {
5381    prot = PROT_READ;
5382  } else {
5383    prot = PROT_READ | PROT_WRITE;
5384  }
5385
5386  if (allow_exec) {
5387    prot |= PROT_EXEC;
5388  }
5389
5390  if (addr != NULL) {
5391    flags |= MAP_FIXED;
5392  }
5393
5394  char* mapped_address = (char*)mmap(addr, (size_t)bytes, prot, flags,
5395                                     fd, file_offset);
5396  if (mapped_address == MAP_FAILED) {
5397    return NULL;
5398  }
5399  return mapped_address;
5400}
5401
5402
5403// Remap a block of memory.
5404char* os::pd_remap_memory(int fd, const char* file_name, size_t file_offset,
5405                          char *addr, size_t bytes, bool read_only,
5406                          bool allow_exec) {
5407  // same as map_memory() on this OS
5408  return os::map_memory(fd, file_name, file_offset, addr, bytes, read_only,
5409                        allow_exec);
5410}
5411
5412
5413// Unmap a block of memory.
5414bool os::pd_unmap_memory(char* addr, size_t bytes) {
5415  return munmap(addr, bytes) == 0;
5416}
5417
5418static jlong slow_thread_cpu_time(Thread *thread, bool user_sys_cpu_time);
5419
5420static clockid_t thread_cpu_clockid(Thread* thread) {
5421  pthread_t tid = thread->osthread()->pthread_id();
5422  clockid_t clockid;
5423
5424  // Get thread clockid
5425  int rc = os::Linux::pthread_getcpuclockid(tid, &clockid);
5426  assert(rc == 0, "pthread_getcpuclockid is expected to return 0 code");
5427  return clockid;
5428}
5429
5430// current_thread_cpu_time(bool) and thread_cpu_time(Thread*, bool)
5431// are used by JVM M&M and JVMTI to get user+sys or user CPU time
5432// of a thread.
5433//
5434// current_thread_cpu_time() and thread_cpu_time(Thread*) returns
5435// the fast estimate available on the platform.
5436
5437jlong os::current_thread_cpu_time() {
5438  if (os::Linux::supports_fast_thread_cpu_time()) {
5439    return os::Linux::fast_thread_cpu_time(CLOCK_THREAD_CPUTIME_ID);
5440  } else {
5441    // return user + sys since the cost is the same
5442    return slow_thread_cpu_time(Thread::current(), true /* user + sys */);
5443  }
5444}
5445
5446jlong os::thread_cpu_time(Thread* thread) {
5447  // consistent with what current_thread_cpu_time() returns
5448  if (os::Linux::supports_fast_thread_cpu_time()) {
5449    return os::Linux::fast_thread_cpu_time(thread_cpu_clockid(thread));
5450  } else {
5451    return slow_thread_cpu_time(thread, true /* user + sys */);
5452  }
5453}
5454
5455jlong os::current_thread_cpu_time(bool user_sys_cpu_time) {
5456  if (user_sys_cpu_time && os::Linux::supports_fast_thread_cpu_time()) {
5457    return os::Linux::fast_thread_cpu_time(CLOCK_THREAD_CPUTIME_ID);
5458  } else {
5459    return slow_thread_cpu_time(Thread::current(), user_sys_cpu_time);
5460  }
5461}
5462
5463jlong os::thread_cpu_time(Thread *thread, bool user_sys_cpu_time) {
5464  if (user_sys_cpu_time && os::Linux::supports_fast_thread_cpu_time()) {
5465    return os::Linux::fast_thread_cpu_time(thread_cpu_clockid(thread));
5466  } else {
5467    return slow_thread_cpu_time(thread, user_sys_cpu_time);
5468  }
5469}
5470
5471//  -1 on error.
5472static jlong slow_thread_cpu_time(Thread *thread, bool user_sys_cpu_time) {
5473  pid_t  tid = thread->osthread()->thread_id();
5474  char *s;
5475  char stat[2048];
5476  int statlen;
5477  char proc_name[64];
5478  int count;
5479  long sys_time, user_time;
5480  char cdummy;
5481  int idummy;
5482  long ldummy;
5483  FILE *fp;
5484
5485  snprintf(proc_name, 64, "/proc/self/task/%d/stat", tid);
5486  fp = fopen(proc_name, "r");
5487  if (fp == NULL) return -1;
5488  statlen = fread(stat, 1, 2047, fp);
5489  stat[statlen] = '\0';
5490  fclose(fp);
5491
5492  // Skip pid and the command string. Note that we could be dealing with
5493  // weird command names, e.g. user could decide to rename java launcher
5494  // to "java 1.4.2 :)", then the stat file would look like
5495  //                1234 (java 1.4.2 :)) R ... ...
5496  // We don't really need to know the command string, just find the last
5497  // occurrence of ")" and then start parsing from there. See bug 4726580.
5498  s = strrchr(stat, ')');
5499  if (s == NULL) return -1;
5500
5501  // Skip blank chars
5502  do { s++; } while (s && isspace(*s));
5503
5504  count = sscanf(s,"%c %d %d %d %d %d %lu %lu %lu %lu %lu %lu %lu",
5505                 &cdummy, &idummy, &idummy, &idummy, &idummy, &idummy,
5506                 &ldummy, &ldummy, &ldummy, &ldummy, &ldummy,
5507                 &user_time, &sys_time);
5508  if (count != 13) return -1;
5509  if (user_sys_cpu_time) {
5510    return ((jlong)sys_time + (jlong)user_time) * (1000000000 / clock_tics_per_sec);
5511  } else {
5512    return (jlong)user_time * (1000000000 / clock_tics_per_sec);
5513  }
5514}
5515
5516void os::current_thread_cpu_time_info(jvmtiTimerInfo *info_ptr) {
5517  info_ptr->max_value = ALL_64_BITS;       // will not wrap in less than 64 bits
5518  info_ptr->may_skip_backward = false;     // elapsed time not wall time
5519  info_ptr->may_skip_forward = false;      // elapsed time not wall time
5520  info_ptr->kind = JVMTI_TIMER_TOTAL_CPU;  // user+system time is returned
5521}
5522
5523void os::thread_cpu_time_info(jvmtiTimerInfo *info_ptr) {
5524  info_ptr->max_value = ALL_64_BITS;       // will not wrap in less than 64 bits
5525  info_ptr->may_skip_backward = false;     // elapsed time not wall time
5526  info_ptr->may_skip_forward = false;      // elapsed time not wall time
5527  info_ptr->kind = JVMTI_TIMER_TOTAL_CPU;  // user+system time is returned
5528}
5529
5530bool os::is_thread_cpu_time_supported() {
5531  return true;
5532}
5533
// System loadavg support.  Returns -1 if the load average cannot be obtained.
// Linux doesn't yet have an (official) notion of processor sets,
// so just return the system-wide load average.
5537int os::loadavg(double loadavg[], int nelem) {
5538  return ::getloadavg(loadavg, nelem);
5539}
5540
5541void os::pause() {
5542  char filename[MAX_PATH];
5543  if (PauseAtStartupFile && PauseAtStartupFile[0]) {
5544    jio_snprintf(filename, MAX_PATH, "%s", PauseAtStartupFile);
5545  } else {
5546    jio_snprintf(filename, MAX_PATH, "./vm.paused.%d", current_process_id());
5547  }
5548
5549  int fd = ::open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0666);
5550  if (fd != -1) {
5551    struct stat buf;
5552    ::close(fd);
5553    while (::stat(filename, &buf) == 0) {
5554      (void)::poll(NULL, 0, 100);
5555    }
5556  } else {
5557    jio_fprintf(stderr,
5558                "Could not open pause file '%s', continuing immediately.\n", filename);
5559  }
5560}
5561
5562extern char** environ;
5563
5564// Run the specified command in a separate process. Return its exit value,
5565// or -1 on failure (e.g. can't fork a new process).
5566// Unlike system(), this function can be called from signal handler. It
5567// doesn't block SIGINT et al.
5568int os::fork_and_exec(char* cmd) {
5569  const char * argv[4] = {"sh", "-c", cmd, NULL};
5570
5571  pid_t pid = fork();
5572
5573  if (pid < 0) {
5574    // fork failed
5575    return -1;
5576
5577  } else if (pid == 0) {
5578    // child process
5579
5580    execve("/bin/sh", (char* const*)argv, environ);
5581
5582    // execve failed
5583    _exit(-1);
5584
5585  } else  {
5586    // copied from J2SE ..._waitForProcessExit() in UNIXProcess_md.c; we don't
5587    // care about the actual exit code, for now.
5588
5589    int status;
5590
5591    // Wait for the child process to exit.  This returns immediately if
5592    // the child has already exited. */
5593    while (waitpid(pid, &status, 0) < 0) {
5594      switch (errno) {
5595      case ECHILD: return 0;
5596      case EINTR: break;
5597      default: return -1;
5598      }
5599    }
5600
5601    if (WIFEXITED(status)) {
5602      // The child exited normally; get its exit code.
5603      return WEXITSTATUS(status);
5604    } else if (WIFSIGNALED(status)) {
5605      // The child exited because of a signal
5606      // The best value to return is 0x80 + signal number,
5607      // because that is what all Unix shells do, and because
5608      // it allows callers to distinguish between process exit and
5609      // process death by signal.
5610      return 0x80 + WTERMSIG(status);
5611    } else {
5612      // Unknown exit code; pass it through
5613      return status;
5614    }
5615  }
5616}
5617
5618// is_headless_jre()
5619//
5620// Test for the existence of xawt/libmawt.so or libawt_xawt.so
5621// in order to report if we are running in a headless jre
5622//
5623// Since JDK8 xawt/libmawt.so was moved into the same directory
5624// as libawt.so, and renamed libawt_xawt.so
5625//
5626bool os::is_headless_jre() {
5627  struct stat statbuf;
5628  char buf[MAXPATHLEN];
5629  char libmawtpath[MAXPATHLEN];
5630  const char *xawtstr  = "/xawt/libmawt.so";
5631  const char *new_xawtstr = "/libawt_xawt.so";
5632  char *p;
5633
5634  // Get path to libjvm.so
5635  os::jvm_path(buf, sizeof(buf));
5636
5637  // Get rid of libjvm.so
5638  p = strrchr(buf, '/');
5639  if (p == NULL) {
5640    return false;
5641  } else {
5642    *p = '\0';
5643  }
5644
5645  // Get rid of client or server
5646  p = strrchr(buf, '/');
5647  if (p == NULL) {
5648    return false;
5649  } else {
5650    *p = '\0';
5651  }
5652
5653  // check xawt/libmawt.so
5654  strcpy(libmawtpath, buf);
5655  strcat(libmawtpath, xawtstr);
5656  if (::stat(libmawtpath, &statbuf) == 0) return false;
5657
5658  // check libawt_xawt.so
5659  strcpy(libmawtpath, buf);
5660  strcat(libmawtpath, new_xawtstr);
5661  if (::stat(libmawtpath, &statbuf) == 0) return false;
5662
5663  return true;
5664}
5665
5666// Get the default path to the core file
5667// Returns the length of the string
5668int os::get_core_path(char* buffer, size_t bufferSize) {
5669  /*
5670   * Max length of /proc/sys/kernel/core_pattern is 128 characters.
5671   * See https://www.kernel.org/doc/Documentation/sysctl/kernel.txt
5672   */
5673  const int core_pattern_len = 129;
5674  char core_pattern[core_pattern_len] = {0};
5675
5676  int core_pattern_file = ::open("/proc/sys/kernel/core_pattern", O_RDONLY);
5677  if (core_pattern_file == -1) {
5678    return -1;
5679  }
5680
5681  ssize_t ret = ::read(core_pattern_file, core_pattern, core_pattern_len);
5682  ::close(core_pattern_file);
5683  if (ret <= 0 || ret >= core_pattern_len || core_pattern[0] == '\n') {
5684    return -1;
5685  }
5686  if (core_pattern[ret-1] == '\n') {
5687    core_pattern[ret-1] = '\0';
5688  } else {
5689    core_pattern[ret] = '\0';
5690  }
5691
5692  char *pid_pos = strstr(core_pattern, "%p");
5693  int written;
5694
5695  if (core_pattern[0] == '/') {
5696    written = jio_snprintf(buffer, bufferSize, "%s", core_pattern);
5697  } else {
5698    char cwd[PATH_MAX];
5699
5700    const char* p = get_current_directory(cwd, PATH_MAX);
5701    if (p == NULL) {
5702      return -1;
5703    }
5704
5705    if (core_pattern[0] == '|') {
5706      written = jio_snprintf(buffer, bufferSize,
5707                             "\"%s\" (or dumping to %s/core.%d)",
5708                             &core_pattern[1], p, current_process_id());
5709    } else {
5710      written = jio_snprintf(buffer, bufferSize, "%s/%s", p, core_pattern);
5711    }
5712  }
5713
5714  if (written < 0) {
5715    return -1;
5716  }
5717
5718  if (((size_t)written < bufferSize) && (pid_pos == NULL) && (core_pattern[0] != '|')) {
5719    int core_uses_pid_file = ::open("/proc/sys/kernel/core_uses_pid", O_RDONLY);
5720
5721    if (core_uses_pid_file != -1) {
5722      char core_uses_pid = 0;
5723      ssize_t ret = ::read(core_uses_pid_file, &core_uses_pid, 1);
5724      ::close(core_uses_pid_file);
5725
5726      if (core_uses_pid == '1') {
5727        jio_snprintf(buffer + written, bufferSize - written,
5728                                          ".%d", current_process_id());
5729      }
5730    }
5731  }
5732
5733  return strlen(buffer);
5734}
5735
5736bool os::start_debugging(char *buf, int buflen) {
5737  int len = (int)strlen(buf);
5738  char *p = &buf[len];
5739
5740  jio_snprintf(p, buflen-len,
5741               "\n\n"
5742               "Do you want to debug the problem?\n\n"
5743               "To debug, run 'gdb /proc/%d/exe %d'; then switch to thread " UINTX_FORMAT " (" INTPTR_FORMAT ")\n"
5744               "Enter 'yes' to launch gdb automatically (PATH must include gdb)\n"
5745               "Otherwise, press RETURN to abort...",
5746               os::current_process_id(), os::current_process_id(),
5747               os::current_thread_id(), os::current_thread_id());
5748
5749  bool yes = os::message_box("Unexpected Error", buf);
5750
5751  if (yes) {
5752    // yes, user asked VM to launch debugger
5753    jio_snprintf(buf, sizeof(char)*buflen, "gdb /proc/%d/exe %d",
5754                 os::current_process_id(), os::current_process_id());
5755
5756    os::fork_and_exec(buf);
5757    yes = false;
5758  }
5759  return yes;
5760}
5761
5762
5763// Java/Compiler thread:
5764//
5765//   Low memory addresses
5766// P0 +------------------------+
5767//    |                        |\  Java thread created by VM does not have glibc
5768//    |    glibc guard page    | - guard page, attached Java thread usually has
5769//    |                        |/  1 glibc guard page.
5770// P1 +------------------------+ Thread::stack_base() - Thread::stack_size()
5771//    |                        |\
5772//    |  HotSpot Guard Pages   | - red, yellow and reserved pages
5773//    |                        |/
5774//    +------------------------+ JavaThread::stack_reserved_zone_base()
5775//    |                        |\
5776//    |      Normal Stack      | -
5777//    |                        |/
5778// P2 +------------------------+ Thread::stack_base()
5779//
5780// Non-Java thread:
5781//
5782//   Low memory addresses
5783// P0 +------------------------+
5784//    |                        |\
5785//    |  glibc guard page      | - usually 1 page
5786//    |                        |/
5787// P1 +------------------------+ Thread::stack_base() - Thread::stack_size()
5788//    |                        |\
5789//    |      Normal Stack      | -
5790//    |                        |/
5791// P2 +------------------------+ Thread::stack_base()
5792//
// ** P1 (aka bottom) and size (P2 = P1 + size) are the address and stack size
//    returned from pthread_attr_getstack().
5795// ** Due to NPTL implementation error, linux takes the glibc guard page out
5796//    of the stack size given in pthread_attr. We work around this for
5797//    threads created by the VM. (We adapt bottom to be P1 and size accordingly.)
5798//
5799#ifndef ZERO
5800static void current_stack_region(address * bottom, size_t * size) {
5801  if (os::Linux::is_initial_thread()) {
5802    // initial thread needs special handling because pthread_getattr_np()
5803    // may return bogus value.
5804    *bottom = os::Linux::initial_thread_stack_bottom();
5805    *size   = os::Linux::initial_thread_stack_size();
5806  } else {
5807    pthread_attr_t attr;
5808
5809    int rslt = pthread_getattr_np(pthread_self(), &attr);
5810
5811    // JVM needs to know exact stack location, abort if it fails
5812    if (rslt != 0) {
5813      if (rslt == ENOMEM) {
5814        vm_exit_out_of_memory(0, OOM_MMAP_ERROR, "pthread_getattr_np");
5815      } else {
5816        fatal("pthread_getattr_np failed with error = %d", rslt);
5817      }
5818    }
5819
5820    if (pthread_attr_getstack(&attr, (void **)bottom, size) != 0) {
5821      fatal("Cannot locate current stack attributes!");
5822    }
5823
5824    // Work around NPTL stack guard error.
5825    size_t guard_size = 0;
5826    rslt = pthread_attr_getguardsize(&attr, &guard_size);
5827    if (rslt != 0) {
5828      fatal("pthread_attr_getguardsize failed with error = %d", rslt);
5829    }
5830    *bottom += guard_size;
5831    *size   -= guard_size;
5832
5833    pthread_attr_destroy(&attr);
5834
5835  }
5836  assert(os::current_stack_pointer() >= *bottom &&
5837         os::current_stack_pointer() < *bottom + *size, "just checking");
5838}
5839
5840address os::current_stack_base() {
5841  address bottom;
5842  size_t size;
5843  current_stack_region(&bottom, &size);
5844  return (bottom + size);
5845}
5846
5847size_t os::current_stack_size() {
5848  // This stack size includes the usable stack and HotSpot guard pages
5849  // (for the threads that have Hotspot guard pages).
5850  address bottom;
5851  size_t size;
5852  current_stack_region(&bottom, &size);
5853  return size;
5854}
5855#endif
5856
5857static inline struct timespec get_mtime(const char* filename) {
5858  struct stat st;
5859  int ret = os::stat(filename, &st);
5860  assert(ret == 0, "failed to stat() file '%s': %s", filename, strerror(errno));
5861  return st.st_mtim;
5862}
5863
5864int os::compare_file_modified_times(const char* file1, const char* file2) {
5865  struct timespec filetime1 = get_mtime(file1);
5866  struct timespec filetime2 = get_mtime(file2);
5867  int diff = filetime1.tv_sec - filetime2.tv_sec;
5868  if (diff == 0) {
5869    return filetime1.tv_nsec - filetime2.tv_nsec;
5870  }
5871  return diff;
5872}
5873
5874/////////////// Unit tests ///////////////
5875
5876#ifndef PRODUCT
5877
5878#define test_log(...)              \
5879  do {                             \
5880    if (VerboseInternalVMTests) {  \
5881      tty->print_cr(__VA_ARGS__);  \
5882      tty->flush();                \
5883    }                              \
5884  } while (false)
5885
5886class TestReserveMemorySpecial : AllStatic {
5887 public:
5888  static void small_page_write(void* addr, size_t size) {
5889    size_t page_size = os::vm_page_size();
5890
5891    char* end = (char*)addr + size;
5892    for (char* p = (char*)addr; p < end; p += page_size) {
5893      *p = 1;
5894    }
5895  }
5896
5897  static void test_reserve_memory_special_huge_tlbfs_only(size_t size) {
5898    if (!UseHugeTLBFS) {
5899      return;
5900    }
5901
5902    test_log("test_reserve_memory_special_huge_tlbfs_only(" SIZE_FORMAT ")", size);
5903
5904    char* addr = os::Linux::reserve_memory_special_huge_tlbfs_only(size, NULL, false);
5905
5906    if (addr != NULL) {
5907      small_page_write(addr, size);
5908
5909      os::Linux::release_memory_special_huge_tlbfs(addr, size);
5910    }
5911  }
5912
5913  static void test_reserve_memory_special_huge_tlbfs_only() {
5914    if (!UseHugeTLBFS) {
5915      return;
5916    }
5917
5918    size_t lp = os::large_page_size();
5919
5920    for (size_t size = lp; size <= lp * 10; size += lp) {
5921      test_reserve_memory_special_huge_tlbfs_only(size);
5922    }
5923  }
5924
5925  static void test_reserve_memory_special_huge_tlbfs_mixed() {
5926    size_t lp = os::large_page_size();
5927    size_t ag = os::vm_allocation_granularity();
5928
5929    // sizes to test
5930    const size_t sizes[] = {
5931      lp, lp + ag, lp + lp / 2, lp * 2,
5932      lp * 2 + ag, lp * 2 - ag, lp * 2 + lp / 2,
5933      lp * 10, lp * 10 + lp / 2
5934    };
5935    const int num_sizes = sizeof(sizes) / sizeof(size_t);
5936
5937    // For each size/alignment combination, we test three scenarios:
5938    // 1) with req_addr == NULL
5939    // 2) with a non-null req_addr at which we expect to successfully allocate
5940    // 3) with a non-null req_addr which contains a pre-existing mapping, at which we
5941    //    expect the allocation to either fail or to ignore req_addr
5942
5943    // Pre-allocate two areas; they shall be as large as the largest allocation
5944    //  and aligned to the largest alignment we will be testing.
5945    const size_t mapping_size = sizes[num_sizes - 1] * 2;
5946    char* const mapping1 = (char*) ::mmap(NULL, mapping_size,
5947      PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE,
5948      -1, 0);
5949    assert(mapping1 != MAP_FAILED, "should work");
5950
5951    char* const mapping2 = (char*) ::mmap(NULL, mapping_size,
5952      PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE,
5953      -1, 0);
5954    assert(mapping2 != MAP_FAILED, "should work");
5955
5956    // Unmap the first mapping, but leave the second mapping intact: the first
5957    // mapping will serve as a value for a "good" req_addr (case 2). The second
5958    // mapping, still intact, as "bad" req_addr (case 3).
5959    ::munmap(mapping1, mapping_size);
5960
5961    // Case 1
5962    test_log("%s, req_addr NULL:", __FUNCTION__);
5963    test_log("size            align           result");
5964
5965    for (int i = 0; i < num_sizes; i++) {
5966      const size_t size = sizes[i];
5967      for (size_t alignment = ag; is_size_aligned(size, alignment); alignment *= 2) {
5968        char* p = os::Linux::reserve_memory_special_huge_tlbfs_mixed(size, alignment, NULL, false);
5969        test_log(SIZE_FORMAT_HEX " " SIZE_FORMAT_HEX " ->  " PTR_FORMAT " %s",
5970                 size, alignment, p2i(p), (p != NULL ? "" : "(failed)"));
5971        if (p != NULL) {
5972          assert(is_ptr_aligned(p, alignment), "must be");
5973          small_page_write(p, size);
5974          os::Linux::release_memory_special_huge_tlbfs(p, size);
5975        }
5976      }
5977    }
5978
5979    // Case 2
5980    test_log("%s, req_addr non-NULL:", __FUNCTION__);
5981    test_log("size            align           req_addr         result");
5982
5983    for (int i = 0; i < num_sizes; i++) {
5984      const size_t size = sizes[i];
5985      for (size_t alignment = ag; is_size_aligned(size, alignment); alignment *= 2) {
5986        char* const req_addr = (char*) align_ptr_up(mapping1, alignment);
5987        char* p = os::Linux::reserve_memory_special_huge_tlbfs_mixed(size, alignment, req_addr, false);
5988        test_log(SIZE_FORMAT_HEX " " SIZE_FORMAT_HEX " " PTR_FORMAT " ->  " PTR_FORMAT " %s",
5989                 size, alignment, p2i(req_addr), p2i(p),
5990                 ((p != NULL ? (p == req_addr ? "(exact match)" : "") : "(failed)")));
5991        if (p != NULL) {
5992          assert(p == req_addr, "must be");
5993          small_page_write(p, size);
5994          os::Linux::release_memory_special_huge_tlbfs(p, size);
5995        }
5996      }
5997    }
5998
5999    // Case 3
6000    test_log("%s, req_addr non-NULL with preexisting mapping:", __FUNCTION__);
6001    test_log("size            align           req_addr         result");
6002
6003    for (int i = 0; i < num_sizes; i++) {
6004      const size_t size = sizes[i];
6005      for (size_t alignment = ag; is_size_aligned(size, alignment); alignment *= 2) {
6006        char* const req_addr = (char*) align_ptr_up(mapping2, alignment);
6007        char* p = os::Linux::reserve_memory_special_huge_tlbfs_mixed(size, alignment, req_addr, false);
6008        test_log(SIZE_FORMAT_HEX " " SIZE_FORMAT_HEX " " PTR_FORMAT " ->  " PTR_FORMAT " %s",
6009                 size, alignment, p2i(req_addr), p2i(p), ((p != NULL ? "" : "(failed)")));
6010        // as the area around req_addr contains already existing mappings, the API should always
6011        // return NULL (as per contract, it cannot return another address)
6012        assert(p == NULL, "must be");
6013      }
6014    }
6015
6016    ::munmap(mapping2, mapping_size);
6017
6018  }
6019
6020  static void test_reserve_memory_special_huge_tlbfs() {
6021    if (!UseHugeTLBFS) {
6022      return;
6023    }
6024
6025    test_reserve_memory_special_huge_tlbfs_only();
6026    test_reserve_memory_special_huge_tlbfs_mixed();
6027  }
6028
6029  static void test_reserve_memory_special_shm(size_t size, size_t alignment) {
6030    if (!UseSHM) {
6031      return;
6032    }
6033
6034    test_log("test_reserve_memory_special_shm(" SIZE_FORMAT ", " SIZE_FORMAT ")", size, alignment);
6035
6036    char* addr = os::Linux::reserve_memory_special_shm(size, alignment, NULL, false);
6037
6038    if (addr != NULL) {
6039      assert(is_ptr_aligned(addr, alignment), "Check");
6040      assert(is_ptr_aligned(addr, os::large_page_size()), "Check");
6041
6042      small_page_write(addr, size);
6043
6044      os::Linux::release_memory_special_shm(addr, size);
6045    }
6046  }
6047
6048  static void test_reserve_memory_special_shm() {
6049    size_t lp = os::large_page_size();
6050    size_t ag = os::vm_allocation_granularity();
6051
6052    for (size_t size = ag; size < lp * 3; size += ag) {
6053      for (size_t alignment = ag; is_size_aligned(size, alignment); alignment *= 2) {
6054        test_reserve_memory_special_shm(size, alignment);
6055      }
6056    }
6057  }
6058
6059  static void test() {
6060    test_reserve_memory_special_huge_tlbfs();
6061    test_reserve_memory_special_shm();
6062  }
6063};
6064
6065void TestReserveMemorySpecial_test() {
6066  TestReserveMemorySpecial::test();
6067}
6068
6069#endif
6070