Sunday, April 28, 2013

Design of Mesa 3D Part 8: Shader Compilation Example

So I've discussed how shaders get compiled, but I wanted to give a concrete example showing the stages a shader goes through. Here's the simple shader:


#version 120

float dontinlineme(float x) {
  while (x > 0) {
    if ((x / 5) * 5 == x) {
      return x;
    }   
    x--;
  }
  return x;
}
float inlineme(float x) {
  return (x / 5) * 5;
}

attribute float attr;

void main() {
  gl_Position = vec4(0.0, 0.0, 0.0, 1.0);
  float x = dontinlineme(attr);
  float y = inlineme(attr+1.0);
  if (x == y) {
    gl_Position.yz = vec2(float(x), float(y));
  } else {
    gl_Position.yxz = vec3(0.4, 0.5, 0.6);
  }
}

Here's the AST that is created to represent the shader:


FUNCTION main ( scope=0x80547b680
) param scope = 0x80547b680
{ locals=0x80547b710  outer=0x80547b680
   //child 0 of 4:
   EXPR:  locals=0x80547b788 outer=0x80547b680
      ASSIGNMENT  locals=0x80547c0a8 outer=0x80547b680
         VAR' gl_Position  (in scope 0x7fffffffacb8) locals=0x80547b878 outer=0x80547b680
      := at 0x80547b7a0 locals=0x80547c0a8 outer=0x80547b680
         LITERAL (0.000000 0.000000 0.000000 1.000000 )
   //child 1 of 4:
   { locals=0x80547c270  outer=0x80547b680
      //child 0 of 1:
      DECL (locals=0x80547c528 outer=0x80547b680) float x (0x80547c290) (in scope 0x80547b680)  := INITIALIZER
            COMMA-SEQ  at 0x80547c2e8 locals=0x805480ca0 outer=0x80547b680
            //child 0 of 3:
            DECL (locals=0x805480dd8 outer=0x805480ca0) float __resultTmp (0x805480e28) (in scope 0x805480ca0) ;
            //child 1 of 3:
            {{ // new scope  locals=0x805480ea0 outer=0x805479098: x 
               //child 0 of 4:
               DECL (locals=0x805482910 outer=0x805480ea0) float x (0x8054829c8) (in scope 0x805480ea0)  :=
                  VAR' attr  (in scope 0x7fffffffad10) locals=0x8054829a0 outer=0x80547b680
               //child 1 of 4:
               WHILE LOOP: locals = 0x805480fa8
                  WHILE cond:
                     EXPR:  locals=0x8054810b0 outer=0x805480fa8
                           VAR' x  (in scope 0x805480ea0) locals=0x805481248 outer=0x805480fa8
                        > at 0x8054810c8 locals=0x805481140 outer=0x805480fa8
                           LITERAL (0 )
                  WHILE body:
                     {{ // new scope  locals=0x805481298 outer=0x805480fa8: 
                        //child 0 of 2:
                        IF
                              ASM: vec4_multiply at 0x805481538 locals=0x8054853a0 outer=0x8038e47c0
                                 ASM  at 0x805481538 locals=0x8054853a0 outer=0x8038e47c0
                                 //child 0 of 2:
                                    COMMA-SEQ  at 0x8054853b8 locals=0x8054857c0 outer=0x805481298
                                    //child 0 of 3:
                                    DECL (locals=0x8054858f8 outer=0x8054857c0) float __resultTmp (0x805485948) (in scope 0x8054857c0) ;
                                    //child 1 of 3:
                                    {{ // new scope  locals=0x8054859a0 outer=0x8038e7b68: b bInv 
                                       //child 0 of 5:
                                       DECL (locals=0x805486170 outer=0x8054859a0) const float b (0x805486228) (in scope 0x8054859a0)  :=
                                          LITERAL (5 )
                                       //child 1 of 5:
                                       { locals=0x805485b20  outer=0x8054859a0
                                          //child 0 of 1:
                                          DECL (locals=0x805485bb0 outer=0x8054859a0) float bInv (0x805486290) (in scope 0x8054859a0) ;
                                       }
                                       //child 2 of 5:
                                       ASM: float_rcp at 0x8054863a8 locals=0x805485bd8 outer=0x8054859a0
                                          ASM  at 0x8054863a8 locals=0x805485bd8 outer=0x8054859a0
                                          //child 0 of 2:
                                          FIELD x of
                                             VAR' bInv  (in scope 0x8054859a0) locals=0x805485d70 outer=0x8054859a0
                                          //child 1 of 2:
                                          VAR' b  (in scope 0x8054859a0) locals=0x805485d98 outer=0x8054859a0
                                       //child 3 of 5:
                                       ASM: vec4_multiply at 0x805486408 locals=0x805485dc0 outer=0x8054859a0
                                          ASM  at 0x805486408 locals=0x805485dc0 outer=0x8054859a0
                                          //child 0 of 3:
                                          VAR' __resultTmp  (in scope 0x8054857c0) locals=0x805485fb0 outer=0x8054857c0
                                          //child 1 of 3:
                                          VAR' x  (in scope 0x805480ea0) locals=0x805485fd0 outer=0x805481298
                                          //child 2 of 3:
                                          VAR' bInv  (in scope 0x8054859a0) locals=0x805485f80 outer=0x8054859a0
                                       //child 4 of 5:
                                       LABEL (null)
                                    }}
                                    //child 2 of 3:
                                    VAR __resultTmp  (in scope 0x8054857c0)
                                 //child 1 of 2:
                                 LITERAL (5 )
                           == at 0x8054813b8 locals=0x805481520 outer=0x805481298
                              VAR' x  (in scope 0x805480ea0) locals=0x8054818a8 outer=0x805481298
                        THEN
                           {{ // new scope  locals=0x8054818d0 outer=0x805481298: 
                              //child 0 of 1:
                              { locals=0x805481ff8  outer=0x8054818d0
                                 //child 0 of 2:
                                 ASSIGNMENT  locals=0x805482100 outer=0x805481ff8
                                    VAR' __resultTmp  (in scope 0x805480ca0) locals=0x805482208 outer=0x805480ca0
                                 := at 0x805482010 locals=0x805482100 outer=0x805481ff8
                                    VAR' x  (in scope 0x805480ea0) locals=0x805482228 outer=0x8054818d0
                                 //child 1 of 2:
                                 RETURN
                              }
                           }}
                        ELSE
                           EXPR:  locals=0x805481a20 outer=0x805481298
                              (oper-void)
                        ENDIF
                        //child 1 of 2:
                        EXPR:  locals=0x805481ae0 outer=0x805481298
                              COMMA-SEQ  at 0x805481af8 locals=0x80548b698 outer=0x805481298
                              //child 0 of 3:
                              DECL (locals=0x80548b7d0 outer=0x80548b698) float __resultTmp (0x80548b820) (in scope 0x80548b698) ;
                              //child 1 of 3:
                              {{ // new scope  locals=0x80548b878 outer=0x8041e1398: 
                                 //child 0 of 3:
                                 EXPR:  locals=0x80548b980 outer=0x8041e1398
                                    ASSIGNMENT  locals=0x80548ba10 outer=0x8041e1398
                                       VAR' __resultTmp  (in scope 0x80548b698) locals=0x80548be88 outer=0x80548b698
                                    := at 0x80548b998 locals=0x80548ba10 outer=0x8041e1398
                                       VAR' x  (in scope 0x805480ea0) locals=0x80548bea8 outer=0x805481298
                                 //child 1 of 3:
                                 EXPR:  locals=0x80548bb68 outer=0x8041e1398
                                    ASSIGNMENT  locals=0x80548bbf8 outer=0x8041e1398
                                       VAR' x  (in scope 0x805480ea0) locals=0x80548bec8 outer=0x805481298
                                    := at 0x80548bb80 locals=0x80548bbf8 outer=0x8041e1398
                                       ASM: vec4_subtract at 0x80548bc70 locals=0x80548e378 outer=0x8038e1490
                                          ASM  at 0x80548bc70 locals=0x80548e378 outer=0x8038e1490
                                          //child 0 of 2:
                                          VAR' x  (in scope 0x805480ea0) locals=0x80548e560 outer=0x805481298
                                          //child 1 of 2:
                                          LITERAL (1.000000 )
                                 //child 2 of 3:
                                 LABEL (null)
                              }}
                              //child 2 of 3:
                              VAR __resultTmp  (in scope 0x80548b698)
                     }}
               END WHILE LOOP
               //child 2 of 4:
               { locals=0x805482570  outer=0x805480ea0
                  //child 0 of 2:
                  ASSIGNMENT  locals=0x805482678 outer=0x805482570
                     VAR' __resultTmp  (in scope 0x805480ca0) locals=0x805482780 outer=0x805480ca0
                  := at 0x805482588 locals=0x805482678 outer=0x805482570
                     VAR' x  (in scope 0x805480ea0) locals=0x8054827a0 outer=0x805480ea0
                  //child 1 of 2:
                  RETURN
               }
               //child 3 of 4:
               LABEL (null)
            }}
            //child 2 of 3:
            VAR __resultTmp  (in scope 0x805480ca0)
   }
   //child 2 of 4:
   { locals=0x80547c660  outer=0x80547b680
      //child 0 of 1:
      DECL (locals=0x80547cc28 outer=0x80547b680) float y (0x80547c688) (in scope 0x80547b680)  := INITIALIZER
            COMMA-SEQ  at 0x80547c6e0 locals=0x80548f4e8 outer=0x80547b680
            //child 0 of 3:
            DECL (locals=0x80548f620 outer=0x80548f4e8) float __resultTmp (0x80548f670) (in scope 0x80548f4e8) ;
            //child 1 of 3:
            {{ // new scope  locals=0x80548f6c8 outer=0x80547abf0: x 
               //child 0 of 3:
               DECL (locals=0x805490570 outer=0x80548f6c8) float x (0x805490758) (in scope 0x80548f6c8)  :=
                  ASM: vec4_add at 0x805490588 locals=0x805492b70 outer=0x8038de1d8
                     ASM  at 0x805490588 locals=0x805492b70 outer=0x8038de1d8
                     //child 0 of 2:
                     VAR' attr  (in scope 0x7fffffffad10) locals=0x805492d58 outer=0x80547b680
                     //child 1 of 2:
                     LITERAL (1.000000 )
               //child 1 of 3:
               { locals=0x80548ffd0  outer=0x80548f6c8
                  //child 0 of 2:
                  ASSIGNMENT  locals=0x8054900d8 outer=0x80548ffd0
                     VAR' __resultTmp  (in scope 0x80548f4e8) locals=0x8054901e0 outer=0x80548f4e8
                  := at 0x80548ffe8 locals=0x8054900d8 outer=0x80548ffd0
                     ASM: vec4_multiply at 0x805490150 locals=0x805493158 outer=0x8038e47c0
                        ASM  at 0x805490150 locals=0x805493158 outer=0x8038e47c0
                        //child 0 of 2:
                           COMMA-SEQ  at 0x805493170 locals=0x805493578 outer=0x80548f6c8
                           //child 0 of 3:
                           DECL (locals=0x8054936b0 outer=0x805493578) float __resultTmp (0x805493700) (in scope 0x805493578) ;
                           //child 1 of 3:
                           {{ // new scope  locals=0x805493758 outer=0x8038e7b68: b bInv 
                              //child 0 of 5:
                              DECL (locals=0x805493f28 outer=0x805493758) const float b (0x805493fe0) (in scope 0x805493758)  :=
                                 LITERAL (5 )
                              //child 1 of 5:
                              { locals=0x8054938d8  outer=0x805493758
                                 //child 0 of 1:
                                 DECL (locals=0x805493968 outer=0x805493758) float bInv (0x805494048) (in scope 0x805493758) ;
                              }
                              //child 2 of 5:
                              ASM: float_rcp at 0x805494160 locals=0x805493990 outer=0x805493758
                                 ASM  at 0x805494160 locals=0x805493990 outer=0x805493758
                                 //child 0 of 2:
                                 FIELD x of
                                    VAR' bInv  (in scope 0x805493758) locals=0x805493b28 outer=0x805493758
                                 //child 1 of 2:
                                 VAR' b  (in scope 0x805493758) locals=0x805493b50 outer=0x805493758
                              //child 3 of 5:
                              ASM: vec4_multiply at 0x8054941c0 locals=0x805493b78 outer=0x805493758
                                 ASM  at 0x8054941c0 locals=0x805493b78 outer=0x805493758
                                 //child 0 of 3:
                                 VAR' __resultTmp  (in scope 0x805493578) locals=0x805493d68 outer=0x805493578
                                 //child 1 of 3:
                                 VAR' x  (in scope 0x80548f6c8) locals=0x805493d88 outer=0x80548f6c8
                                 //child 2 of 3:
                                 VAR' bInv  (in scope 0x805493758) locals=0x805493d38 outer=0x805493758
                              //child 4 of 5:
                              LABEL (null)
                           }}
                           //child 2 of 3:
                           VAR __resultTmp  (in scope 0x805493578)
                        //child 1 of 2:
                        LITERAL (5 )
                  //child 1 of 2:
                  (oper-void)
               }
               //child 2 of 3:
               LABEL (null)
            }}
            //child 2 of 3:
            VAR __resultTmp  (in scope 0x80548f4e8)
   }
   //child 3 of 4:
   IF
         VAR' x  (in scope 0x80547b680) locals=0x80547ceb0 outer=0x80547b680
      == at 0x80547dc90 locals=0x80547d0c0 outer=0x80547b680
         VAR' y  (in scope 0x80547b680) locals=0x80547cf88 outer=0x80547b680
   THEN
      {{ // new scope  locals=0x80547d288 outer=0x80547b680: 
         //child 0 of 1:
         EXPR:  locals=0x80547d300 outer=0x80547d288
            ASSIGNMENT  locals=0x80547db88 outer=0x80547d288
               FIELD yz of
                  VAR' gl_Position  (in scope 0x7fffffffacb8) locals=0x80547d3f0 outer=0x80547d288
            := at 0x80547d318 locals=0x80547db88 outer=0x80547d288
                  COMMA-SEQ  at 0x80547dc00 locals=0x8054998f8 outer=0x80547d288
                  //child 0 of 3:
                  DECL (locals=0x805499a30 outer=0x8054998f8) vec2 __resultTmp (0x805499a80) (in scope 0x8054998f8) ;
                  //child 1 of 3:
                  {{ // new scope  locals=0x805499ad8 outer=0x80380d510: x y 
                     //child 0 of 5:
                     DECL (locals=0x80549a248 outer=0x805499ad8) const float x (0x80549a398) (in scope 0x805499ad8)  :=
                           COMMA-SEQ  at 0x80549a260 locals=0x80549cb98 outer=0x80547d288
                           //child 0 of 3:
                           DECL (locals=0x80549ccd0 outer=0x80549cb98) float __resultTmp (0x80549cd20) (in scope 0x80549cb98) ;
                           //child 1 of 3:
                           {{ // new scope  locals=0x80549cd78 outer=0x80380cb18: 
                              //child 0 of 2:
                              EXPR:  locals=0x80549ce08 outer=0x80380cb18
                                 ASSIGNMENT  locals=0x80549ce98 outer=0x80380cb18
                                    VAR' __resultTmp  (in scope 0x80549cb98) locals=0x80549cff8 outer=0x80549cb98
                                 := at 0x80549ce20 locals=0x80549ce98 outer=0x80380cb18
                                    VAR' x  (in scope 0x80547b680) locals=0x80549d018 outer=0x80547d288
                              //child 1 of 2:
                              LABEL (null)
                           }}
                           //child 2 of 3:
                           VAR __resultTmp  (in scope 0x80549cb98)
                     //child 1 of 5:
                     DECL (locals=0x80549a570 outer=0x805499ad8) const float y (0x80549a6c8) (in scope 0x805499ad8)  :=
                           COMMA-SEQ  at 0x80549a588 locals=0x80549f860 outer=0x80547d288
                           //child 0 of 3:
                           DECL (locals=0x80549f998 outer=0x80549f860) float __resultTmp (0x80549f9e8) (in scope 0x80549f860) ;
                           //child 1 of 3:
                           {{ // new scope  locals=0x80549fa40 outer=0x80380cb18: 
                              //child 0 of 2:
                              EXPR:  locals=0x80549fad0 outer=0x80380cb18
                                 ASSIGNMENT  locals=0x80549fb60 outer=0x80380cb18
                                    VAR' __resultTmp  (in scope 0x80549f860) locals=0x80549fcc0 outer=0x80549f860
                                 := at 0x80549fae8 locals=0x80549fb60 outer=0x80380cb18
                                    VAR' y  (in scope 0x80547b680) locals=0x80549fce0 outer=0x80547d288
                              //child 1 of 2:
                              LABEL (null)
                           }}
                           //child 2 of 3:
                           VAR __resultTmp  (in scope 0x80549f860)
                     //child 2 of 5:
                     EXPR:  locals=0x805499be0 outer=0x805499ad8
                        ASSIGNMENT  locals=0x805499c70 outer=0x805499ad8
                           FIELD x of
                              VAR' __resultTmp  (in scope 0x8054998f8) locals=0x80549a0e8 outer=0x8054998f8
                        := at 0x805499bf8 locals=0x805499c70 outer=0x805499ad8
                           VAR' x  (in scope 0x805499ad8) locals=0x805499e30 outer=0x805499ad8
                     //child 3 of 5:
                     EXPR:  locals=0x805499e60 outer=0x805499ad8
                        ASSIGNMENT  locals=0x805499ef0 outer=0x805499ad8
                           FIELD y of
                              VAR' __resultTmp  (in scope 0x8054998f8) locals=0x80549a108 outer=0x8054998f8
                        := at 0x805499e78 locals=0x805499ef0 outer=0x805499ad8
                           VAR' y  (in scope 0x805499ad8) locals=0x80549a0b0 outer=0x805499ad8
                     //child 4 of 5:
                     LABEL (null)
                  }}
                  //child 2 of 3:
                  VAR __resultTmp  (in scope 0x8054998f8)
      }}
   ELSE
      {{ // new scope  locals=0x80547ddb0 outer=0x80547b680: 
         //child 0 of 1:
         EXPR:  locals=0x80547de28 outer=0x80547ddb0
            ASSIGNMENT  locals=0x80547e698 outer=0x80547ddb0
               FIELD yxz of
                  VAR' gl_Position  (in scope 0x7fffffffacb8) locals=0x80547df18 outer=0x80547ddb0
            := at 0x80547de40 locals=0x80547e698 outer=0x80547ddb0
               LITERAL (0.400000 0.500000 0.600000 )
      }}
   ENDIF
}



Here's the intermediate representation that this gets turned into:

NEW SCOPE
   COPY
      VAR gl_Position at ENV_PARAM[0]  store 0x805458248
      FLOAT 0 0 0 1
   VAR_DECL x (0x80547c290) at TEMP[-7]  store 0x805480b40
   COPY
      VAR x at TEMP[-7]  store 0x805480b40
      VAR_DECL __resultTmp (0x805480e28) at TEMP[-7]  store 0x805482c58
      CALL dontinlineme_1
      VAR __resultTmp at TEMP[-7]  store 0x805482c58
   VAR_DECL y (0x80547c688) at TEMP[-7]  store 0x80548f390
   COPY
      VAR y at TEMP[-7]  store 0x80548f390
      VAR_DECL __resultTmp (0x80548f670) at TEMP[-7]  store 0x805490958
      NEW SCOPE
         VAR_DECL x (0x805490758) at TEMP[-7]  store 0x805492a10
         COPY
            VAR x at TEMP[-7]  store 0x805492a10
            IR_ADD (0x805492d98, 0x805492e08)  (store 0x805492f18)
               VAR attr.x??? at LOCAL_PARAM[16]  store 0x80547b650
               FLOAT 1 1 1 1
         COPY
            VAR __resultTmp at TEMP[-7]  store 0x805490958
            IR_MUL (0x805496db0, 0x805496e20)  (store 0x805496f30)
               VAR_DECL __resultTmp (0x805493700) at TEMP[-7]  store 0x805494308
               NEW SCOPE
                  VAR_DECL b (0x805493fe0) at TEMP[-7]  store 0x8054963c0
                  COPY
                     VAR b at TEMP[-7]  store 0x8054963c0
                     FLOAT 5 5 5 5
                  VAR_DECL bInv (0x805494048) at TEMP[-7]  store 0x805496650
                  IR_RCP (0x8054966f0, 0x0)  (store 0x8054968b0)
                     VAR b at TEMP[-7]  store 0x8054963c0
                  IR_MUL (0x805496950, 0x8054969c0)  (store 0x805494308)
                     VAR x at TEMP[-7]  store 0x805492a10
                     VAR bInv at TEMP[-7]  store 0x805496650
                  LABEL: __endOfFunc_/_
               VAR __resultTmp at TEMP[-7]  store 0x805494308
               FLOAT 5 5 5 5
         IR_NOP (0x0, 0x0)  (store 0x0)
         LABEL: __endOfFunc_inlineme_
      VAR __resultTmp at TEMP[-7]  store 0x805490958
   IF 
      COND
         IR_EQUAL (0x805497580, 0x805497510)  (store 0x805497660)
            VAR x at TEMP[-7]  store 0x805480b40
            VAR y at TEMP[-7]  store 0x80548f390
   THEN
      NEW SCOPE
         COPY
            SWIZZLE .yz?? of  (store 0x8054997f8) 
               VAR gl_Position at ENV_PARAM[0]  store 0x805458248
            SWIZZLE .xxyw of  (store 0x8054a2c40) 
               VAR_DECL __resultTmp (0x805499a80) at TEMP[-7]  store 0x80549a988
               NEW SCOPE
                  VAR_DECL x (0x80549a398) at TEMP[-7]  store 0x80549ca40
                  COPY
                     VAR x at TEMP[-7]  store 0x80549ca40
                     VAR_DECL __resultTmp (0x80549cd20) at TEMP[-7]  store 0x80549d180
                     NEW SCOPE
                        COPY
                           VAR __resultTmp at TEMP[-7]  store 0x80549d180
                           VAR x at TEMP[-7]  store 0x805480b40
                        LABEL: __endOfFunc_float_
                     VAR __resultTmp at TEMP[-7]  store 0x80549d180
                  VAR_DECL y (0x80549a6c8) at TEMP[-7]  store 0x80549f708
                  COPY
                     VAR y at TEMP[-7]  store 0x80549f708
                     VAR_DECL __resultTmp (0x80549f9e8) at TEMP[-7]  store 0x80549fe48
                     NEW SCOPE
                        COPY
                           VAR __resultTmp at TEMP[-7]  store 0x80549fe48
                           VAR y at TEMP[-7]  store 0x80548f390
                        LABEL: __endOfFunc_float_
                     VAR __resultTmp at TEMP[-7]  store 0x80549fe48
                  COPY
                     SWIZZLE .x??? of  (store 0x8054a24b0) 
                        VAR __resultTmp at TEMP[-7]  store 0x80549a988
                     VAR x at TEMP[-7]  store 0x80549ca40
                  COPY
                     SWIZZLE .y??? of  (store 0x8054a2710) 
                        VAR __resultTmp at TEMP[-7]  store 0x80549a988
                     SWIZZLE .xxzw of  (store 0x8054a2820) 
                        VAR y at TEMP[-7]  store 0x80549f708
                  LABEL: __endOfFunc_vec2_
               VAR __resultTmp at TEMP[-7]  store 0x80549a988
   ELSE
      NEW SCOPE
         COPY
            SWIZZLE .yxz? of  (store 0x8054a4e48) 
               VAR gl_Position at ENV_PARAM[0]  store 0x805458248
            SWIZZLE .yxzw of  (store 0x8054a4f88) 
               FLOAT 0.4 0.5 0.6 0.6
   ENDIF
LABEL: __endOfFunc__main



And here's the actual generated code:

# Vertex Program/Shader
  0: MOV OUTPUT[0], CONST[0];
  1: CAL 22;  # dontinlineme_1
  2: MOV TEMP[0].x, TEMP[0].yyyy;
  3: ADD TEMP[0].w, INPUT[16].xxxx, CONST[0].wwww;
  4: MOV TEMP[1].z, CONST[1].xxxx;
  5: RCP TEMP[1].w, TEMP[1].zzzz;
  6: MUL TEMP[1].y, TEMP[0].wwww, TEMP[1].wwww;
  7: MUL TEMP[0].z, TEMP[1].yyyy, CONST[1].xxxx;
  8: MOV TEMP[0].y, TEMP[0].zzzz;
  9: SEQ TEMP[0].z, TEMP[0].xxxx, TEMP[0].yyyy;
 10: IF TEMP[0].zzzz;  # (if false, goto 19);
 11:    MOV TEMP[1].z, TEMP[0].xxxx;
 12:    MOV TEMP[0].w, TEMP[1].zzzz;
 13:    MOV TEMP[1].w, TEMP[0].yyyy;
 14:    MOV TEMP[1].z, TEMP[1].wwww;
 15:    MOV TEMP[1].x, TEMP[0].wwww;
 16:    MOV TEMP[1].y, TEMP[1].zzzw;
 17:    MOV OUTPUT[0].yz, TEMP[1].xxyw;
 18: ELSE; # (goto 21)
 19:    MOV OUTPUT[0].xyz, CONST[2].yxzw;
 20: ENDIF;
 21: END
 22: BGNSUB;  # dontinlineme_1
 23:    MOV TEMP[0].z, INPUT[16].xxxx;
 24:    BGNLOOP; # (end at 40)
 25:       SLE TEMP[0].w, TEMP[0].zzzz, CONST[0].xxxx;
 26:       IF TEMP[0].wwww;  # (if false, goto 29);
 27:          BRK (TR); # (goto 41);
 28:       ENDIF;
 29:       MOV TEMP[1].y, CONST[1].xxxx;
 30:       RCP TEMP[1].z, TEMP[1].yyyy;
 31:       MUL TEMP[1].x, TEMP[0].zzzz, TEMP[1].zzzz;
 32:       MUL TEMP[1].y, TEMP[1].xxxx, CONST[1].xxxx;
 33:       SEQ TEMP[1].x, TEMP[1].yyyy, TEMP[0].zzzz;
 34:       IF TEMP[1].xxxx;  # (if false, goto 37);
 35:          MOV TEMP[0].y, TEMP[0].zzzz;
 36:          RET (TR);
 37:       ENDIF;
 38:       MOV TEMP[1].y, TEMP[0].zzzz;
 39:       SUB TEMP[0].z, TEMP[0].zzzz, CONST[0].wwww;
 40:    ENDLOOP; # (goto 24)
 41:    MOV TEMP[0].y, TEMP[0].zzzz;
 42:    RET (TR);
 43: ENDSUB;  # dontinlineme_1
InputsRead: 0x10000 (0b1,00000000,00000000)
OutputsWritten: 0x0 (0b0)
NumInstructions=44
NumTemporaries=0
NumParameters=0
NumAttributes=0
NumAddressRegs=0
SamplersUsed: 0x0 (0b0)
Samplers=[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ]
param list 0x802cb4280
dirty state flags: 0x0
param[0] sz=4 CONST (null) = {0, 0, 0, 1}
param[1] sz=1 CONST (null) = {5, 5, 5, 5}
param[2] sz=3 CONST (null) = {0.4, 0.5, 0.6, 0.6}

Hopefully that helps make things clearer.


Design of Mesa 3D Part 7: Shader Assembly Emission

The last stage in glCompileShader is to actually emit assembly commands that can be executed by the shader VM. We've previously created an intermediate representation of nodes that represent the shader; now the task is to serialize this tree into something similar to an object file. I believe that the actual assembly language is the ARB assembly language, which can then be translated by a driver into platform-specific instructions. This architecture is similar to HLSL's "assembly" language. This takes place in emit(), defined in src/mesa/shader/slang/slang_emit.c. You pass a slang_ir_node to the function; the initial node is the root of the IR tree.

This function is a big switch statement switching over the IR opcode. All of the math operators (abs, sin, min, add, less-than, etc.) fall through to the same case, which calls emit_arith(). This function is actually really straightforward; it calls emit() on all of its children, allocates a node to store the result by calling alloc_node_storage(), then calls emit_instruction() on the operation itself.

alloc_node_storage() is also fairly straightforward; it's used to allocate temporary storage for IR nodes whose Store field (in the slang_ir_node struct) has not been set yet. This is a code block from the beginning of the function:


   if (!n->Store) {
      assert(defaultSize > 0);
      n->Store = _slang_new_ir_storage(PROGRAM_TEMPORARY, -1, defaultSize);
   }

Therefore, an invariant is that n->Store should always be set for an IR node after this function is called on it. _slang_new_ir_storage() is a simple constructor that just copies the register file, index, and size into a newly allocated slang_ir_storage struct. I've already copied and pasted the definition of slang_ir_storage_ in this post. One of the interesting things is that the index parameter in slang_ir_storage_ is allowed to be -1, which means that the actual location doesn't matter; just put it anywhere (as long as it's in the correct register file). Because of this, alloc_node_storage() must choose a real index for all the -1 indexes. It does this by calling _slang_alloc_temp(), defined in src/mesa/shader/slang/slang_vartable.c. This function calls alloc_reg() to actually do the allocation, then fills in the slang_ir_storage with the appropriate values for the newly allocated register. alloc_reg() uses the struct table defined at the top of the same file. This struct represents meta information about which parts of which register files are free. Here's the definition:


typedef enum {
   FREE,
   VAR,
   TEMP
} TempState;

/**
 * Variable/register info for one variable scope.
 */
struct table
{
   int Level;
   int NumVars;
   slang_variable **Vars;  /* array [NumVars] */
   TempState Temps[MAX_PROGRAM_TEMPS * 4];  /* per-component state */
   int ValSize[MAX_PROGRAM_TEMPS * 4];     /**< For debug only */
   struct table *Parent;  /** Parent scope table */
};

The algorithm that alloc_reg() uses is quite straightforward; it simply walks Temps trying to find 4 successive components that are marked as FREE. Once it's found one, it marks them all as TEMP. So that's pretty simple.

Back to emitting code. Emitting a single instruction is handled with the emit_instruction() function. This function takes an opcode and 4 slang_ir_storage nodes: one for the destination and 3 for the inputs. You would think that this function would be trivial; however, because of indirect register inputs + outputs, it isn't. If the output or any of the inputs are indirect, this function has to deal with it. I'll skip over how we deal with this for now, but once we have our input and output registers, the code just looks like this:

   inst = new_instruction(emitInfo, opcode);
   if (!inst)
      return NULL;

   if (dst)
      storage_to_dst_reg(&inst->DstReg, dst);

   for (i = 0; i < 3; i++) {
      if (src[i])
         storage_to_src_reg(&inst->SrcReg[i], src[i]);
   }

new_instruction() is a trivial function: if we're at the end of our output array, grow the buffer, then just get a pointer to the next available instruction in the array, and initialize it. The instruction stream is attached to the gl_program object stored in emitInfo->prog; this will become important when we call functions. storage_to_dst_reg() and storage_to_src_reg() are also rather simple: they simply fill in the register file and index, as well as a swizzle. Here are the prog_src_register and prog_dst_register structs, defined in src/mesa/shader/prog_instruction.h.

struct prog_src_register
{
   GLuint File:4; /**< One of the PROGRAM_* register file values. */
   GLint Index:(INST_INDEX_BITS+1); /**< Extra bit here for sign bit.
                                     * May be negative for relative addressing.
                                     */
   GLuint Swizzle:12;
   GLuint RelAddr:1;
   /** Take the component-wise absolute value */
   GLuint Abs:1;
   /**
    * Post-Abs negation.
    * This will either be NEGATE_NONE or NEGATE_XYZW, except for the SWZ
    * instruction which allows per-component negation.
    */
   GLuint Negate:4;
};

/**
 * Instruction destination register.
 * Packed into bitfields to keep each instruction operand small.
 */
struct prog_dst_register
{
   GLuint File:4;      /**< One of the PROGRAM_* register file values */
   GLuint Index:INST_INDEX_BITS;  /**< Unsigned, never negative */
   GLuint WriteMask:4; /**< Presumably one enable bit per component -- confirm */
   /** Set when the register is addressed relative to the address register. */
   GLuint RelAddr:1;
   /**
    * \name Conditional destination update control.
    *
    * \since
    * NV_fragment_program, NV_fragment_program_option, NV_vertex_program2,
    * NV_vertex_program2_option.
    */
   /*@{*/
   /**
    * Takes one of the 9 possible condition values (EQ, FL, GT, GE, LE, LT,
    * NE, TR, or UN).  Dest reg is only written to if the matching
    * (swizzled) condition code value passes.  When a conditional update mask
    * is not specified, this will be \c COND_TR.
    */
   GLuint CondMask:4;
   /**
    * Condition code swizzle value.
    */
   GLuint CondSwizzle:12;
   /**
    * Selects the condition code register to use for conditional destination
    * update masking.  In NV_fragment_program or NV_vertex_program2 mode, only
    * condition code register 0 is available.  In NV_vertex_program3 mode,
    * condition code registers 0 and 1 are available.
    */
   GLuint CondSrc:1;
   /*@}*/
   /**
    * Unused padding bits.
    * NOTE(review): presumably rounds the struct up to a word boundary;
    * verify against the value of INST_INDEX_BITS.
    */
   GLuint pad:28;
};

As you can see, the instruction is optimized for size by using bitfields.

Alright, let's talk about indirect registers. This indirection is done using the ARL instruction, or Address Register Load. The spec (Section 2.14.5.3) states that it simply performs a load into the address register, which is then used for future loads and stores. This is used for doing array accesses where the index is a variable; that requires loading the value of the variable into the address register, then doing an operation using the address register as an offset into the array. However, we only have one address register (only the x component is actually used). What happens if we want to say something like x[i] + y[j]? The add instruction uses the address register explicitly, but the two operands should have different offsets. This means that we have to first load x[i] into a temporary, then run temp + y[j]. Allocating this temporary register uses the same call that it did above. It then emits a MOV instruction using the address register. A similar codepath occurs for an indirect destination register; however, if the destination is relative, all of the relative sources will be put into temporaries, so we can use an indirect destination here. The RelAddr bit in the prog_dst_register and prog_src_register structs determines if we should use the address register. After we emit the actual instruction that we're trying to perform, we have to then free the temporary registers that we've allocated.

Cool; that's how we do math. Register loads and stores work the same way. IR_SEQ instructions work exactly the way you would expect. A variable declaration tries to call _slang_alloc_var(), which works similarly to _slang_alloc_temp(). The IR_NOT operator is implemented as v = v == 0, which is cool.

All right, what about comparisons? Because performing less-than and greater-than operations doesn't make sense on structs and vectors, they are handled by emit_arith(). However, equality comparisons work almost exactly the same way, except that we have to be able to compare structs and vectors, etc. Comparing two floats is straightforward; just call emit_instruction(). Comparing two vectors is a little more complicated, because the comparison instruction returns a vector of outputs, one for each component. We can solve this by computing the dot product of the output with itself, and looking at the result. This requires allocating a temporary. Now, what about structs? This just allocates an accumulator, and walks through the size of the object, adding the output of the comparisons to the accumulator. Then, we can use the dot product trick again. Note that this won't work with arrays with padding; this is kind of an interesting problem (which doesn't look like it is solved in this version of Mesa).

Alright, how about loops? There is an IR_LOOP instruction, which triggers a call to emit_loop(). There is a flag in the slang_emit_info structure which determines if we should emit so-called "high level" instructions. If so, we can simply emit an OPCODE_BGNLOOP instruction, which is pretty cool. Before we do that, we save the number of previously-emitted instructions to use for a label to jump to, should we need to. Then we can just emit the body of the loop (the 0th child of the IR loop), and then possibly emit OPCODE_ENDLOOP. Otherwise, we emit an OPCODE_BRA (branch) instruction, and set the target to the beginning of the loop. Once we've done that, we have to walk through the instructions in the loop, looking for IR_BREAK and IR_CONT nodes, and replacing them with OPCODE_BRA nodes. Now we're done!

Sampling from a texture is simply an instruction, so that doesn't add much complexity.

The last piece I'd like to get into is function calls. Because setting up all the arguments and the return value was done when creating the IR (along with as much inlining as possible), calling functions is actually fairly simple. Because instruction streams are attached to gl_program objects, we save the current gl_program object (originally in emitInfo->prog) and create a new program by calling new_subroutine(), which delegates to ctx->Driver.NewProgram(). Then, we can emit a label for the new function, call emit() on the function body, and emit a return instruction just in case. We also might surround the function with OPCODE_BGNSUB and OPCODE_ENDSUB instructions, if emitInfo->EmitBeginEndSub is set. Once we've emitted the new function, we set the active program to the original saved value and emit the OPCODE_CAL instruction to that stream.

Cool! Now we've got a stream of instructions that our VM can execute. Before getting into VM execution, the OpenGL pipeline, or linking shaders, I'd like to show the life of an example function, with all its intermediate forms along the way of compilation. I think that'll make the shader compilation steps clearer.