Liping's Blog

翁曰:「無他,但手熟爾。」

sge jsv hang issue

TL;DR sge代码里使用timeval结构体时,没有初始化成员的值,导致timeout时间非常大,jsv进程一直没有timeout退出。

问题描述

用户反馈sge的jsv进程一直卡死,如下面所示,15827、14992、15109、8805等进程应该很快就会结束,而不是一直运行着

# pstree -p | grep -i jsv2
        |-sjm(402)-+-jsv2.sh(15827)---jsv2.sh(2855)---cut(2857)
        |-sjm(30992)-+-jsv2.sh(14992)---jsv2.sh(2849)---cut(2851)
        |-sjm(31662)-+-jsv2.sh(15109)---jsv2.sh(2846)---cut(2848)
        |-sjm(12019)-+-jsv2.sh(8805)

问题分析

先用pstack看进程的情况

# pstack 12019

... skipped ....

Thread 1 (Thread 0x7f802063d840 (LWP 12019)):
#0  0x0000003a9f4bd79d in nanosleep () from /lib64/libc.so.6
#1  0x00007f80223d5e1e in sge_usleep () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#2  0x00007f80223d135e in sge_peclose () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#3  0x00007f8022333d0e in jsv_stop () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#4  0x00007f80223360d9 in jsv_do_communication () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#5  0x00007f8022334dae in jsv_do_verify () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#6  0x00007f80222efe05 in sge_gdi_packet_execute_external () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#7  0x00007f80222ed007 in sge_gdi2_multi () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#8  0x00007f80222ed6fb in sge_gdi2 () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#9  0x00007f80223df189 in japi_send_job () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#10 0x00007f80223e8292 in japi_run_job () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#11 0x00007f80222d4442 in drmaa_run_job () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#12 0x000000000042c375 in SgeBatchSystem::submit(Job&) ()
#13 0x0000000000424d12 in JobMonitor::submitJob() ()
#14 0x00000000004279e8 in JobMonitor::processEvents() ()
#15 0x0000000000427dad in JobMonitor::submit(std::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool, std::vector<std::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&) ()
#16 0x000000000040a2d0 in main ()

可以看到,Thread 1一直停在nanosleep()进程。

使用gdb attach用户进程(注:由于权限原因,当前系统只能以用户权限去attach)

# sudo -u <user_name> gdb -p 12019
(gdb) bt full
#0  0x0000003a9f4bd79d in nanosleep () from /lib64/libc.so.6
No symbol table info available.
#1  0x00007f80223d5e1e in sge_usleep (sleep_time=<value optimized out>) at ../libs/uti/sge_time.c:475
        req = {tv_sec = 0, tv_nsec = 100000000}
        rem = {tv_sec = 0, tv_nsec = 95651751}
#2  0x00007f80223d135e in sge_peclose (pid=8805, fp_in=<value optimized out>, fp_out=<value optimized out>, fp_err=<value optimized out>,
    timeout=0x7ffda3d2ce80) at ../libs/uti/sge_stdio.c:681
        sleep_for = 100000
        i = <value optimized out>
        timeout_us = 140099372198732
        SGE_FUNC = "sge_peclose"
#3  0x00007f8022333d0e in jsv_stop (jsv=0xa862b0, answer_list=<value optimized out>, try_soft_quit=<value optimized out>) at ../libs/sgeobj/sge_jsv.c:393
        scriptfile = 0xa7d810 "/opt/gridengine/default/common/jsv2.sh"
        t = {tv_sec = 0, tv_usec = 140188306598732}
        pid = 8805
        SGE_FUNC = "jsv_stop"
#4  0x00007f80223360d9 in jsv_do_communication (ctx=0xa75270, jsv=0xa862b0, answer_list=<value optimized out>) at ../libs/sgeobj/sge_jsv_script.c:2574
        start_time = 1588999988
        do_retry = true
        jsv_timeout = 10
        ret = true
        input = "STARTED\000TATE ACCEPT Job is accepted", '\000' <repeats 197 times>, "Ч{\237:\000\000\000\321.-\\\002\200\377\377\060\321ң\375\177\000\000/\321ң\375\177", '\000' <repeats 82 times>"\200, \247{\237:\000\000\000\020\004\000\000\000\000\000\000\000\004\000\000\000\000\000\000\020T\250\000\000\000\000\000@3\252\000\000\000\000\000\b\a\001\000\000\000\000\000y\344G\237:\000\000\000\000\004\000\000\000\000\000\000\361--\\\002\200\377\377A\000\000\000\000\000\000\000\020\000\000\000\000\000\000\000@\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\020\322ң\375\177\000\000\000\000\000\000\000\000\000\000]\000\000\000n", '\000' <repeats 19 times>, "w\000\000\000|\000\000\000"...
        SGE_FUNC = "jsv_do_communication"
#5  0x00007f8022334dae in jsv_do_verify (ctx=0xa75270, context=<value optimized out>, job=0x7ffda3d2f8e8, answer_list=0x7ffda3d2fab8, holding_lock=false)
    at ../libs/sgeobj/sge_jsv.c:1095
        old_job = <value optimized out>
        new_job = 0xa9b770
        jsv_url = 0x0
        holding_mutex = <value optimized out>
        ret = <value optimized out>
        jsv = 0xa862b0
        jsv_next = 0x0
        iterator = 0x0
        SGE_FUNC = "jsv_do_verify"

 ... skipped ...

从frame 3可以看到jsv执行jsv_stop()去停止jsv实例,代码里函数的描述是这样的

// source/libs/sgeobj/sge_jsv.c

/****** sge_jsv/jsv_stop() *****************************************************
*  NAME
*     jsv_stop() -- Stop a JSV instance which was previously started-
*
*  SYNOPSIS
*     bool jsv_stop(lListElem *jsv, lList **answer_list, bool try_soft_quit)-
*
*  FUNCTION
*     Stop a running JSV instance which was previously started. If the
*     variable 'try_soft_quit' is set to 'true' then this function tries
*     to communicate with the process. It will then send a "QUIT" string
*     to pipe end which is connected with the stdin of the JSV process.
*     If the script does then not terminate within JSV_QUIT_TIMEOUT-
*     seconds then it will be terminated hard via kill signal.

下面是实际代码部分

// source/libs/sgeobj/sge_jsv.c

bool
jsv_stop(lListElem *jsv, lList **answer_list, bool try_soft_quit) {
   bool ret = true;
   pid_t pid = -1;

   DENTER(TOP_LAYER, "jsv_stop");

   /* stop is only possible if it was started before */
   pid = jsv_get_pid(jsv);
   if (pid != -1) {
      const char *scriptfile = lGetString(jsv, JSV_command);
      struct timeval t;

      /*
       * send a quit command to stop the JSV softly.
       * give it then JSV_QUIT_TIMEOUT seconds before
       * the kill is triggered (sge_peclose)
       */
      if (try_soft_quit) { // 大部分情况下,都是try_soft_quit=true,发送QUIT给jsv实例
         jsv_send_command(jsv, answer_list, "QUIT");
         t.tv_sec = JSV_QUIT_TIMEOUT;
      } else { // 但这里的情况是不发送QUIT,下面会详细说明
         t.tv_sec = 0;
      }

      /*
       * kill the JSV process
       */
      sge_peclose(pid, lGetRef(jsv, JSV_in), // sge_peclose会发送SIGKILL给jsv实例
                  lGetRef(jsv, JSV_out), lGetRef(jsv, JSV_err), &t);

      INFO((SGE_EVENT, MSG_JSV_STOPPED_S, scriptfile));

      jsv_set_pid(jsv, -1);
   }
   DRETURN(ret);
}

在frame 3中,有几个重要信息。pid 8805就是我们认为应该运行很长时间的jsv脚本/opt/gridengine/default/common/jsv2.sh,而t是传给sge_peclose()函数的timeout参数。

scriptfile = 0xa7d810 "/opt/gridengine/default/common/jsv2.sh"
t = {tv_sec = 0, tv_usec = 140188306598732}
pid = 8805

再看frame 2里sge_peclose()函数的代码,前面部分首先是初始化timeout_us的值。这里timeout参数就是上面看到的,timeout->tv_sec=0,timeout->tv_usec=140188306598732,这是一个很大的数

// source/libs/uti/sge_stdio.c

int sge_peclose(pid_t pid, FILE *fp_in, FILE *fp_out, FILE *fp_err,
                struct timeval *timeout)
{
   int i, status;
   long timeout_us = timeout ? (timeout->tv_sec * 1000000L + timeout->tv_usec)
                             : 0; // 初始值timeout_us = timeout->tv_usec

   DENTER(TOP_LAYER, "sge_peclose");

   if (fp_in != NULL) { // 到jsv实例的输入、输出都关闭了
      FCLOSE(fp_in);
   }
   if (fp_out != NULL) {
      FCLOSE(fp_out);
   }
   if (fp_err != NULL) {
      FCLOSE(fp_err);
   }

由于输入、输出关闭,strace jsv实例,会看到实例在发送数据的时候报EPIPE错误

# strace -f -p 8805 -e write 2>&1 -s 128 | egrep '^write'
write(1, "ERROR JSV script got unknown command \"\"\n", 40) = -1 EPIPE (Broken pipe)
write(1, "ERROR JSV script got unknown command \"\"\n", 40) = -1 EPIPE (Broken pipe)
write(1, "ERROR JSV script got unknown command \"\"\n", 40) = -1 EPIPE (Broken pipe)

这对应的是${SGE_ROOT}/util/resources/jsv/jsv_include.sh脚本里1723行代码,有兴趣可以了解一下,这里不展开。(注:当前系统sge版本为Son of Grid Engine最新版本)

接着说sge_peclose()函数。这里是一个while循环。

// source/libs/uti/sge_stdio.c

   do {
      errno = 0;
      i = waitpid(pid, &status, timeout?WNOHANG:0); // wait4系统调用
      if (i==-1) {
         DEXIT;
         return errno;
      }
      if (i==0) { /* not yet exited */
         if (timeout_us <= 0) { // timeout_us是一个非常大的值
#ifdef WIN32 /* kill not called */
            DPRINTF(("killing not yet implemented\n"));
            timeout = NULL;
            /* kill(pid, SIGKILL); */
#else
            DPRINTF(("killing\n"));
            timeout = NULL;
            kill(pid, SIGKILL);
#endif /* WIN32 */
         } else {
            /* Spin less for longer timeouts; for timeout > 1s, sleep
               for 0.1s, n the loop, otherwise 1ms.  */
            int sleep_for = (timeout_us > 1000000) ? 100000 : 1000;

            DPRINTF(("%f seconds waiting for exit\n",
                     timeout->tv_sec + timeout->tv_usec / 1e6));
            timeout_us -= sleep_for; // timeout_us在这里调低,但要降到<=0需时很长
            sge_usleep(sleep_for); // nanosleep系统调用
         }
      }
   } while (i != pid);

如果直接去strace 12019这个用户程序的进程,可以看到在进行wait4()nanosleep()两个系统调用,对应的就是上面的waitpid()sge_usleep()在进行底层的调用。

# strace -p 12019
Process 12019 attached
restart_syscall(<... resuming interrupted call ...>) = 0
wait4(8805, 0x7ffda3d2ce3c, WNOHANG, NULL) = 0
nanosleep({0, 100000000}, 0x7ffda3d2cdd0) = 0
wait4(8805, 0x7ffda3d2ce3c, WNOHANG, NULL) = 0
nanosleep({0, 100000000}, 0x7ffda3d2cdd0) = 0
wait4(8805, 0x7ffda3d2ce3c, WNOHANG, NULL) = 0
nanosleep({0, 100000000}, 0x7ffda3d2cdd0) = 0
wait4(8805, 0x7ffda3d2ce3c, WNOHANG, NULL) = 0

到这里我们就知道进程为什么不退出了,因为timeout_us是一个很大的值,不小于等于0,所以sge_peclose()函数并没有执行kill(pid, SIGKILL),而是不断在执行下面这两步。

            timeout_us -= sleep_for;
            sge_usleep(sleep_for);

从gdb可以看到timeout_us的值要减到小于等于0是要很长时间的。

(gdb) frame 2
#2  0x00007f80223d135e in sge_peclose (pid=8805, fp_in=<value optimized out>, fp_out=<value optimized out>, fp_err=<value optimized out>,
    timeout=0x7ffda3d2ce80) at ../libs/uti/sge_stdio.c:681
681                 sge_usleep(sleep_for);
(gdb) info locals
sleep_for = 100000
i = <value optimized out>
timeout_us = 140099372198732
SGE_FUNC = "sge_peclose"
  1. 为什么会出现这个问题?

主要是因为jsv_stop()函数里t这个值没有初始化,代码里只设置了t.tv_sec,没有设置t.tv_usec这个值,这会导致不确定的结果,也就是这里看到的超大值。

// linux/latest/source/include/uapi/linux/time.h

struct timeval {
    __kernel_old_time_t tv_sec;     /* seconds */
    __kernel_suseconds_t    tv_usec;    /* microseconds */
};


// source/libs/sgeobj/sge_jsv.c

      struct timeval t;

      /*
       * send a quit command to stop the JSV softly.
       * give it then JSV_QUIT_TIMEOUT seconds before
       * the kill is triggered (sge_peclose)
       */
      if (try_soft_quit) { // 大部分情况下,都是try_soft_quit=true,发送QUIT给jsv实例
         jsv_send_command(jsv, answer_list, "QUIT");
         t.tv_sec = JSV_QUIT_TIMEOUT;
      } else { // 但这里的情况是不发送QUIT,下面会详细说明
         t.tv_sec = 0;
      }
  1. 为什么有些作业有问题,有些作业有问题?

我们再去看成backtrace的输出,frame 4是在../libs/sgeobj/sge_jsv_script.c:2574这一行,do_retry = truejsv_timeout = 10

#4  0x00007f80223360d9 in jsv_do_communication (ctx=0xa75270, jsv=0xa862b0, answer_list=<value optimized out>) at ../libs/sgeobj/sge_jsv_script.c:2574
        start_time = 1588999988
        do_retry = true
        jsv_timeout = 10
        ret = true
        input = "STARTED\000TATE ACCEPT Job is accepted", '\000' <repeats 197 times>, "Ч{\237:\000\000\000\321.-\\\002\200\377\377\060\321ң\375\177\000\000/\321ң\375\177", '\000' <repeats 82 times>"\200, \247{\237:\000\000\000\020\004\000\000\000\000\000\000\000\004\000\000\000\000\000\000\020T\250\000\000\000\000\000@3\252\000\000\000\000\000\b\a\001\000\000\000\000\000y\344G\237:\000\000\000\000\004\000\000\000\000\000\000\361--\\\002\200\377\377A\000\000\000\000\000\000\000\020\000\000\000\000\000\000\000@\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\020\322ң\375\177\000\000\000\000\000\000\000\000\000\000]\000\000\000n", '\000' <repeats 19 times>, "w\000\000\000|\000\000\000"...
        SGE_FUNC = "jsv_do_communication"

对着代码看,要执行2574这行,有两个条件:

  • sge_get_gmt() - start_time > jsv_timeout,就是jsv运行时长已经超过jsv_timeout时间还没有结束(注:默认10秒)
  • 超时后第一次do_retry。这是第一次也是唯一一次重试的机会。
// source/libs/sgeobj/sge_jsv_script.c

2559       while (!lGetBool(jsv, JSV_done)) { // 不断重试,间隔10000微秒
2560          if (sge_get_gmt() - start_time > jsv_timeout) { // 直至超时执行下面的代码
2561             DPRINTF(("JSV - master waited longer than %d s to get response from JSV\n", jsv_timeout));
2562             /*
2563              * In case of a timeout we try it a second time. In that case we kill
2564              * the old instance and start a new one before we continue
2565              * with the verification. Otherwise we report an error which will
2566              * automatically reject the job which should be verified.
2567              */
2568             if (do_retry) { // do_retry初始值是true,这里会重试一次
2569                DPRINTF(("JSV - will retry verification\n"));-
2570                lSetBool(jsv, JSV_restart, false);
2571                lSetBool(jsv, JSV_accept, false);
2572                lSetBool(jsv, JSV_done, false);
2573                DPRINTF(("JSV process will be stopped now\n"));
2574                ret &= jsv_stop(jsv, answer_list, false); // 卡在这里,最后参数为false,导致不发送QUIT给jsv
2575                if (ret) {
2576                   DPRINTF(("JSV process will be started now\n"));
2577                   ret &= jsv_start(jsv, answer_list);
2578                }
2579                if (ret) {
2580                   DPRINTF(("JSV process gets START command\n"));
2581                   while (fscanf(lGetRef(jsv, JSV_err), "%[^\n]\n", input) == 1) {
2582                      ERROR((SGE_EVENT, MSG_JSV_LOGMSG_S, input));--
2583                   }
2584                   ret &= jsv_send_command(jsv, answer_list, "START");
2585                }
2586                start_time = sge_get_gmt();
2587                do_retry = false;

所以,如果jsv执行得快的话,就不会有问题。目前发现有问题的都是通过drmaa投递的,猜测可能的情况时,自动化脚本短时间内会投递大量作业,导致性能问题,jsv执行慢,出现超时现象,然后导致这些作业投递失败。

解决方案

纵观代码,总共有两处地方传递了timeout参数给sge_peclose(),而没有初始化tv_usec的值。代码本来并没有想着要用tv_usec的值,但commit 227a8445修改了sge_peclose()函数sleep代码的逻辑,使用了tv_usec的值。

所以,代码最终修改如下,涉及两个文件:

# diff -p -r sge-master.orig/ sge-master/
diff -p -r sge-master.orig/source/daemons/execd/sge_load_sensor.c sge-master/source/daemons/execd/sge_load_sensor.c
*** sge-master.orig/source/daemons/execd/sge_load_sensor.c      2019-08-13 00:54:13.000000000 +0800
--- sge-master/source/daemons/execd/sge_load_sensor.c   2020-05-11 10:31:58.928879447 +0800
*************** static void sge_ls_stop_ls(lListElem *th
*** 385,390 ****
--- 385,396 ----
        ret = LS_BROKEN_PIPE;
     }

+    /* fix uninitialized t.tv_usec cause undefined behaviour bug.
+    * commit 227a8445 replace sleep() with higher resolution sge_usleep().
+    * it use not only t.tv_sec, but also t.tv_usec.
+    */
+    t.tv_usec = 0;
+
     memset(&t, 0, sizeof(t));
     if (ret == LS_OK) {
        t.tv_sec = LS_QUIT_TIMEOUT;
diff -p -r sge-master.orig/source/libs/sgeobj/sge_jsv.c sge-master/source/libs/sgeobj/sge_jsv.c
*** sge-master.orig/source/libs/sgeobj/sge_jsv.c        2019-08-13 00:54:13.000000000 +0800
--- sge-master/source/libs/sgeobj/sge_jsv.c     2020-05-11 10:28:46.054555752 +0800
*************** jsv_stop(lListElem *jsv, lList **answer_
*** 375,380 ****
--- 375,386 ----
        const char *scriptfile = lGetString(jsv, JSV_command);
        struct timeval t;

+       /* fix uninitialized t.tv_usec cause undefined behaviour bug.
+        * commit 227a8445 replace sleep() with higher resolution sge_usleep().
+        * it use not only t.tv_sec, but also t.tv_usec.
+        */
+       t.tv_usec = 0;
+
        /*
         * send a quit command to stop the JSV softly.
         * give it then JSV_QUIT_TIMEOUT seconds before

重新编译sge,并替换libdrmaa.so.1.0文件。

总结

sge代码的bug是主要的原因,但jsv的效率也需要做一下性能测试。理论上目前jsv脚本的操作很简单,不应该耗时长,但还是要有实际数据来佐证。

Tech sge

Comments