Liping's Blog

翁曰:「無他,但手熟爾。」

用gdb deubg bash输出不正常问题

TL;DR bash参数扩展导致命令输出跟预期结果不一致。

问题描述

用户反馈echo或者printf命令如下,当有1或者g的时候,只会输出1或/和g,没有其他数据,怀疑系统有问题。

$ printf [3jnrw:mvlewgtf09u23434e12ermld234r25cscfw]
1

$ echo [3jnrw:mvlewgtf09u23434e12ermld234r25cscfw]
1 g

$ echo [adfaasdljfalsjljajzpqwekzv]
[adfaasdljfalsjljajzpqwekzv]

$ uname -r
2.6.32-504.16.2.el6.x86_64

问题分析

使用strace命令去跟踪,发现bash在执行execve系统调用的时候,传递给echo的就是只有1和g。问题应该在bash。

# strace -f echo [aaag1bbb]
execve("/bin/echo", ["echo", "1", "g"], [/* 53 vars */]) = 0

接下来用gdb来分析bash的行为。由于只知道bash在执行execve的时候,传递给echo命令的参数不对,所以只设置捕获execve系统调用。

# gdb --args /bin/bash -c '/bin/echo [g1]'

(gdb) catch syscall execve
Catchpoint 1 (syscall 'execve' [59])

运行程序,查看断点的backtrace信息

(gdb) r
Starting program: /bin/bash -c /bin/echo\ \[g1\]

Catchpoint 1 (call to syscall 'execve'), 0x0000003047ebdb37 in execve () from /lib64/libc.so.6
Missing separate debuginfos, use: debuginfo-install ncurses-libs-5.7-4.20090207.el6.x86_64

(gdb) bt
#0  0x0000003047ebdb37 in execve () from /lib64/libc.so.6
#1  0x000000000042e156 in shell_execve (command=0x8ee0e0 "/bin/echo", args=0x8eed00, env=0x8ee9e0) at execute_cmd.c:4816
#2  0x000000000042fade in execute_disk_command (simple_command=<value optimized out>, pipe_in=-1, pipe_out=-1, async=<value optimized out>,
    fds_to_close=<value optimized out>) at execute_cmd.c:4616
#3  execute_simple_command (simple_command=<value optimized out>, pipe_in=-1, pipe_out=-1, async=<value optimized out>, fds_to_close=<value optimized out>)
    at execute_cmd.c:3863
#4  0x0000000000430463 in execute_command_internal (command=0x8ee6f0, asynchronous=<value optimized out>, pipe_in=-1, pipe_out=-1, fds_to_close=0x8ee740)
    at execute_cmd.c:741
#5  0x000000000046b86b in parse_and_execute (string=<value optimized out>, from_file=0x4a0fb8 "-c", flags=<value optimized out>) at evalstring.c:339
#6  0x000000000041bada in run_one_command (command=0x7fffffffdfef "/bin/echo [g1]") at shell.c:1325
#7  0x000000000041cbab in main (argc=<value optimized out>, argv=0x7fffffffdcb8, env=0x7fffffffdcd8) at shell.c:698

先看frame 1的信息,发现shell_execve()函数的参数args是有问题的。不应该是1g,而应该是[g1]

(gdb) frame 1
#1  0x000000000042e156 in shell_execve (command=0x8ee0e0 "/bin/echo", args=0x8eed00, env=0x8ee9e0) at execute_cmd.c:4816
4816      execve (command, args, env);

(gdb) list execute_cmd.c:4816
4811      int larray, i, fd;
4812      char sample[80];
4813      int sample_len;
4814
4815      SETOSTYPE (0);                /* Some systems use for USG/POSIX semantics */
4816      execve (command, args, env);
4817      i = errno;                    /* error from execve() */
4818      CHECK_TERMSIG;
4819      SETOSTYPE (1);
4820

(gdb) set listsize 20

(gdb) list execute_cmd.c:4816
4806    int
4807    shell_execve (command, args, env)
4808         char *command;
4809         char **args, **env;
4810    {
4811      int larray, i, fd;
4812      char sample[80];
4813      int sample_len;
4814
4815      SETOSTYPE (0);                /* Some systems use for USG/POSIX semantics */
4816      execve (command, args, env);
4817      i = errno;                    /* error from execve() */
4818      CHECK_TERMSIG;
4819      SETOSTYPE (1);
4820
4821      /* If we get to this point, then start checking out the file.
4822         Maybe it is something we can hack ourselves. */
4823      if (i != ENOEXEC)
4824        {
4825          if (file_isdir (command))

(gdb) p args
$2 = (char **) 0x8eed00

(gdb) p *args
$3 = 0x8ee880 "/bin/echo"

(gdb) p *args@4
$8 = {0x8ee880 "/bin/echo", 0x8ee940 "1", 0x8ee100 "g", 0x0}

再往上看frame 2,传递的args参数被optimized out了,看不了。但可以看到args是由words来构成的,而words在这里是有问题的。

(gdb) frame 2
#2  0x000000000042fade in execute_disk_command (simple_command=<value optimized out>, pipe_in=-1, pipe_out=-1, async=<value optimized out>,
    fds_to_close=<value optimized out>) at execute_cmd.c:4616
4616          exit (shell_execve (command, args, export_env));

(gdb) p args
$11 = <value optimized out>

(gdb) list execute_cmd.c:4616
4606                }
4607
4608              wl = make_word_list (make_word (NOTFOUND_HOOK), words);
4609              exit (execute_shell_function (hookf, wl));
4610            }
4611
4612          /* Execve expects the command name to be in args[0].  So we
4613             leave it there, in the same format that the user used to
4614             type it in. */
4615          args = strvec_from_word_list (words, 0, 0, (int *)NULL);
4616          exit (shell_execve (command, args, export_env));
4617        }
4618      else
4619        {
4620    parent_return:
4621          /* Make sure that the pipes are closed in the parent. */
4622          close_pipes (pipe_in, pipe_out);
4623    #if defined (PROCESS_SUBSTITUTION) && defined (HAVE_DEV_FD)
4624          if (variable_context == 0)
4625            unlink_fifo_list ();

(gdb) p words
$25 = (WORD_LIST *) 0x8ee960

(gdb) p *words
$26 = {next = 0x8ee8a0, word = 0x8ebf30}

(gdb) p *words->word
$27 = {word = 0x8ee880 "/bin/echo", flags = 0}

(gdb) p *words->next->word
$28 = {word = 0x8ee940 "1", flags = 8}

(gdb) p *words->next->next->word
$29 = {word = 0x8ee100 "g", flags = 8}

接着看frame 3,在执行execute_disk_command()函数的时候,words参数也是有问题的。

(gdb) frame 3
#3  execute_simple_command (simple_command=<value optimized out>, pipe_in=-1, pipe_out=-1, async=<value optimized out>, fds_to_close=<value optimized out>)
    at execute_cmd.c:3863
3863      execute_disk_command (words, simple_command->redirects, command_line,

(gdb) list execute_cmd.c:3863
3853        }
3854
3855      if (command_line == 0)
3856        command_line = savestring (the_printed_command_except_trap);
3857
3858    #if defined (PROCESS_SUBSTITUTION)
3859      if ((subshell_environment & SUBSHELL_COMSUB) && (simple_command->flags & CMD_NO_FORK) && fifos_pending() > 0)
3860        simple_command->flags &= ~CMD_NO_FORK;
3861    #endif
3862
3863      execute_disk_command (words, simple_command->redirects, command_line,
3864                            pipe_in, pipe_out, async, fds_to_close,
3865                            simple_command->flags);
3866
3867     return_result:
3868      bind_lastarg (lastarg);
3869      FREE (command_line);
3870      dispose_words (words);
3871      discard_unwind_frame ("simple-command");
3872      this_command_name = (char *)NULL;     /* points to freed memory now */

(gdb) p words
$31 = (WORD_LIST *) 0x8ee960
(gdb) p *words
$33 = {next = 0x8ee8a0, word = 0x8ebf30}
(gdb) p *words->word
$34 = {word = 0x8ee880 "/bin/echo", flags = 0}
(gdb) p *words->next->word
$35 = {word = 0x8ee940 "1", flags = 8}
(gdb) p *words->next->next->word
$36 = {word = 0x8ee100 "g", flags = 8}

(gdb) p command_line
$32 = 0x8ee9a0 "/bin/echo [g1]"

再看frame 4,可以看到传给execute_simple_command()函数的参数是对的。也就是说在执行execute_simple_command()函数的时候,中间某一步words的值变了,然后对执行execute_disk_command()函数的时候就开始不对了。

(gdb) frame 4
#4  0x0000000000430463 in execute_command_internal (command=0x8ee6f0, asynchronous=<value optimized out>, pipe_in=-1, pipe_out=-1, fds_to_close=0x8ee740)
    at execute_cmd.c:741
741               execute_simple_command (command->value.Simple, pipe_in, pipe_out,

(gdb) list execute_cmd.c:741
731             last_pid = last_made_pid;
732             was_error_trap = signal_is_trapped (ERROR_TRAP) && signal_is_ignored (ERROR_TRAP) == 0;
733
734             if (ignore_return && command->value.Simple)
735               command->value.Simple->flags |= CMD_IGNORE_RETURN;
736             if (command->flags & CMD_STDIN_REDIR)
737               command->value.Simple->flags |= CMD_STDIN_REDIR;
738
739             line_number_for_err_trap = line_number = command->value.Simple->line;
740             exec_result =
741               execute_simple_command (command->value.Simple, pipe_in, pipe_out,
742                                       asynchronous, fds_to_close);
743             line_number = save_line_number;
744
745             /* The temporary environment should be used for only the simple
746                command immediately following its definition. */
747             dispose_used_env_vars ();
748
749     #if (defined (ultrix) && defined (mips)) || defined (C_ALLOCA)
750             /* Reclaim memory allocated with alloca () on machines which */

(gdb) p command
$37 = (COMMAND *) 0x8ee6f0
(gdb) p *command
$38 = {type = cm_simple, flags = 64, line = 0, redirects = 0x0, value = {For = 0x8ee720, Case = 0x8ee720, While = 0x8ee720, If = 0x8ee720,
    Connection = 0x8ee720, Simple = 0x8ee720, Function_def = 0x8ee720, Group = 0x8ee720, Select = 0x8ee720, Arith = 0x8ee720, Cond = 0x8ee720,
    ArithFor = 0x8ee720, Subshell = 0x8ee720, Coproc = 0x8ee720}}
(gdb) p *command->Simple
There is no member named Simple.
(gdb) p *command->value.Simple
$39 = {flags = 64, line = 0, words = 0x8ec0e0, redirects = 0x0}
(gdb) p *command->value.Simple->words
$40 = {next = 0x8ec0c0, word = 0x8ee670}
(gdb) p *command->value.Simple->words->word
$41 = {word = 0x8ee690 "/bin/echo", flags = 0}
(gdb) p *command->value.Simple->words->next->word
$42 = {word = 0x8ee6d0 "[g1]", flags = 0}

上面最简单的定位部分就完成了,下面是对execute_simple_command()函数进行断点,一步步分析。 其中的过程就是在断点开始,对着代码,用n命令一直下一步,用p命令看words这个数据有没有变化。重复的步骤就不啰嗦了。

(gdb) break execute_simple_command
Breakpoint 2 at 0x42eb80: file execute_cmd.c, line 3540.

说重点,在bash-4.1/execute_cmd.c:3656这行words的值是有变化的,但值也是被optimized out了,执行到3667行的时候,可以看到words变了。所以要step in看看expand_words做了什么。

3657          if (simple_command->flags & CMD_IGNORE_RETURN)
(gdb)
3656          words = expand_words (simple_command->words);
(gdb) p *words->next->word
value has been optimized out
(gdb) n
3657          if (simple_command->flags & CMD_IGNORE_RETURN)
(gdb)
3667      if (words == 0)
(gdb) p *words->next->word
$1 = {word = 0x8ee940 "1", flags = 8}

这里list是expand_words()传进来的值,而new_list是返回的值,到这一步还是正确的。

Breakpoint 2, execute_simple_command (simple_command=0x8ee720, pipe_in=-1, pipe_out=-1, async=<value optimized out>, fds_to_close=<value optimized out>)
    at execute_cmd.c:3656
3656          words = expand_words (simple_command->words);
(gdb) s
expand_words (list=0x8ec0e0) at subst.c:8748
8748      return (expand_word_list_internal (list, WEXP_ALL));
(gdb) n
expand_word_list_internal (list=0x8ec0e0, eflags=31) at subst.c:9074
9074    {
(gdb)
9078      if (list == 0)
(gdb)
9074    {
(gdb)
9078      if (list == 0)
(gdb)
9081      garglist = new_list = copy_word_list (list);
(gdb)
9082      if (eflags & WEXP_VARASSIGN)
(gdb)
9084          garglist = new_list = separate_out_assignments (new_list);
(gdb)
9126      new_list = shell_expand_word_list (new_list, eflags);
(gdb) p *list->next->word
value has been optimized out
(gdb) p *new_list->next->word
$4 = {word = 0x8ee880 "[g1]", flags = 0}

这里有个问题是,很多时候由于编译器优化的原因,不能获取值。比如expand_words()执行完了,值也变了,我们没看到是哪一步变的。这时候只能在这个函数从头再来一次,再step into更底层次的函数里。

9134            new_list = glob_expand_word_list (new_list, eflags);
(gdb) p *new_list->next->word
value has been optimized out
(gdb) n
9140      if ((eflags & WEXP_VARASSIGN) && subst_assign_varlist)
(gdb)
9186    }
(gdb)
execute_simple_command (simple_command=0x8ee720, pipe_in=-1, pipe_out=-1, async=<value optimized out>, fds_to_close=<value optimized out>)
    at execute_cmd.c:3657
3657          if (simple_command->flags & CMD_IGNORE_RETURN)
(gdb)

当然,我们从代码上可以看出,修改的地方可能有下面几个,这些都是跟Bash - Shell Expansions相关的。看文档比看代码里的字符操作要更便捷一些。

// bash-4.1/subst.c

new_list = brace_expand_word_list (new_list, eflags);
new_list = shell_expand_word_list (new_list, eflags);
new_list = glob_expand_word_list (new_list, eflags);
new_list = dequote_list (new_list);

其实用户的问题是Filename Expansion - Pattern Matching问题,bash会先把[3jnrw:mvlewgtf09u23434e12ermld234r25cscfw]去做文件名匹配。由于当前目录下有文件1g,所以,一大串字符会扩展成1g再传给echo命令,结果就是输出1g。如果用引号引起来,就没有这个问题了。或者去别的没有这两个文件的目录,也不会有问题。

$ ls -l 1 g
-rw-r--r-- 1 root root 7284 Apr 20 16:44 1
-rw-r--r-- 1 root root  169 Apr 16 14:42 g

$ echo [3jnrw:mvlewgtf09u23434e12ermld234r25cscfw]
1 g

$ echo '[3jnrw:mvlewgtf09u23434e12ermld234r25cscfw]'
[3jnrw:mvlewgtf09u23434e12ermld234r25cscfw]

$ cd /tmp
$ echo [3jnrw:mvlewgtf09u23434e12ermld234r25cscfw]
[3jnrw:mvlewgtf09u23434e12ermld234r25cscfw]

总结

费了这么大周折来搞明白这个问题,想想真有点傻。Filename Expansion虽然平时也有用,但就是没想起来,导致花费了不少的时间。有时候真应该多想想,三思而后行,而不是上来就蛮干。

Comments