Litherum: February 2013

Wednesday, February 27, 2013

Sprites in the OpenGL Programmable Pipeline

Recently, I was on Stack Overflow and I found an interesting question regarding sprites in the programmable pipeline. I was curious, and did some research. I'm not sure that "sprites" is exactly the right terminology for this, but I believe that this is as close to "sprites" as the programmable pipeline gets. Anyway, I wrote a little test program (for GLX and OpenGL 3.2 and beyond). Hope you find it interesting!

Compile with:
clang++ -DGL_GLEXT_PROTOTYPES -DGLX_GLXEXT_PROTOTYPES -Wall -O0 -ggdb -o main $(pkg-config --libs --cflags gl x11) main.cc

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <time.h>
#include <inttypes.h>
#include <GL/gl.h>
#include <GL/glext.h>
#include <GL/glx.h>
#include <GL/glxext.h>
#include <X11/Xlib.h>

// Exit flag for the main loop: set to 1 by the Xlib I/O error handler and by
// GLXStuff::HandleEvents when a ClientMessage event arrives.
static int done = 0;

/**
 * Returns the current CLOCK_MONOTONIC reading in microseconds.
 *
 * Both timespec fields are widened to 64 bits before any arithmetic so that
 * the seconds-to-microseconds multiplication cannot overflow on platforms
 * where time_t is a 32-bit signed integer.
 */
static inline uint64_t monotonicTime() {
  struct timespec ts = {0, 0};
  int res = clock_gettime(CLOCK_MONOTONIC, &ts);
  assert(!res);
  (void)res;  // Keeps NDEBUG builds free of an unused-variable warning.
  return (uint64_t)ts.tv_sec * 1000000u + (uint64_t)ts.tv_nsec / 1000u;
}

/**
 * Xlib I/O error callback (registered through XSetIOErrorHandler).
 *
 * Raises the global `done` flag so the main loop stops, then returns 0.
 * NOTE(review): Xlib's documentation suggests the client exits if an I/O
 * error handler returns, so the flag may never be observed — confirm.
 */
static int handler(Display *display) {
  (void)display;  // The connection handle is not needed here.
  done = 1;
  return 0;
}

class OpenGLStuff {
public:
OpenGLStuff(const int width, const int height) : point_count(30), circle_count(10) {
// Print some debug stuff
printf("%s\n", glGetString(GL_VENDOR));
printf("%s\n", glGetString(GL_RENDERER));
printf("%s\n", glGetString(GL_VERSION));
printf("%s\n", glGetString(GL_SHADING_LANGUAGE_VERSION));
// Set up some boring first-run stuff
glViewport(0, 0, width, height);
glClearColor(0.0f, 0.0f, 0.0f, 1.0f);
glEnable(GL_BLEND);
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
glEnable(GL_PROGRAM_POINT_SIZE);
// I'm actually interested in the maximum sized sprite I can draw is
float range[2];
glGetFloatv(GL_POINT_SIZE_RANGE, range);
printf("Point size range: %f x %f\n", range[0], range[1]);

const GLchar *vertex_shader_source[] = {
//"#version 430\n",
"#version 150\n",
"\n",
"in vec4 in_position;\n",
"uniform int in_point_count;\n",
"uniform int in_circle_count;\n",
"uniform int in_time;\n",
"\n",
"void main() {\n",
" float p = float(gl_InstanceID) / float(in_point_count);\n",
" float pi = asin(1.0) * 2.0;\n",
" float speed = (1.0 / 2.0) * 1000000.0 * 2.0 * pi;\n",
" float radius = 0.1*sin(float(in_time)/(speed/10)) + (float(gl_InstanceID/in_point_count)/float(in_circle_count)) * sin(float(in_time)/speed);\n",
" gl_Position = in_position + vec4(radius*sin(2*pi*p), radius*cos(2*pi*p), 0.0, 0.0);\n",
" gl_PointSize = (sin(radius*2.0*pi + float(in_time)/(speed*4.0))+1.0)/2.0*63.0*20.0;\n",
"}\n",
""
};
const GLchar *fragment_shader_source[] = {
//"#version 430\n",
"#version 150\n",
"\n",
"out vec4 out_color;\n",
"\n",
"void main() {\n",
" vec2 c = (gl_PointCoord - vec2(0.5, 0.5)) * 2;\n",
" float v = c.x*c.x + c.y*c.y;\n",
" if (v < 1.0) {\n",
" out_color = (1.0-v*v)*vec4(0.0, gl_PointCoord.y, gl_PointCoord.x, 1.0);\n",
" } else {\n",
" out_color = vec4(0.0, 0.0, 0.0, 0.0);\n",
" }\n",
"}\n",
""
};

// Set up shader stuff
GLint compile_status;
GLuint vertex_shader = glCreateShader(GL_VERTEX_SHADER);
GLuint fragment_shader = glCreateShader(GL_FRAGMENT_SHADER);
program = glCreateProgram();
glShaderSource(vertex_shader, sizeof(vertex_shader_source)/sizeof(GLchar*), vertex_shader_source, NULL);
glShaderSource(fragment_shader, sizeof(fragment_shader_source)/sizeof(GLchar*), fragment_shader_source, NULL);
glCompileShader(vertex_shader);
glCompileShader(fragment_shader);
glGetShaderiv(vertex_shader, GL_COMPILE_STATUS, &compile_status);
assert(compile_status == GL_TRUE);
glGetShaderiv(fragment_shader, GL_COMPILE_STATUS, &compile_status);
assert(compile_status == GL_TRUE);
glAttachShader(program, vertex_shader);
glAttachShader(program, fragment_shader);
glDeleteShader(vertex_shader);
glDeleteShader(fragment_shader);
glBindFragDataLocation(program, 0, "out_color");
glLinkProgram(program);

// Get variable linkages
glUseProgram(program);
GLint position_attribute = glGetAttribLocation(program, "in_position");
GLint point_count_uniform = glGetUniformLocation(program, "in_point_count");
GLint circle_count_uniform = glGetUniformLocation(program, "in_circle_count");
time_uniform = glGetUniformLocation(program, "in_time");

float position_data[] = {0.0f, 0.0f, 0.0f, 1.0f};
amount_of_data = sizeof(position_data)/(sizeof(float)*4);

// Upload data to the card
glGenVertexArrays(1, &vertex_array);
glBindVertexArray(vertex_array);
glGenBuffers(1, &vertex_position_buffer);
glBindBuffer(GL_ARRAY_BUFFER, vertex_position_buffer);
glBufferData(GL_ARRAY_BUFFER, sizeof(position_data), position_data, GL_STATIC_DRAW);
glEnableVertexAttribArray(position_attribute);
glVertexAttribPointer(position_attribute, 4, GL_FLOAT, GL_FALSE, 0, 0);

// Specify some constants to the shader program
glUniform1i(point_count_uniform, point_count);
glUniform1i(circle_count_uniform, circle_count);

// Make sure everything went well
GLenum error = glGetError();
assert(error == GL_NO_ERROR);

beginning = monotonicTime();
}
~OpenGLStuff() {
glDeleteProgram(program);
glDeleteBuffers(1, &vertex_position_buffer);
glDeleteVertexArrays(1, &vertex_array);
}
void DrawScene() {
uint64_t before = monotonicTime();
glClear(GL_COLOR_BUFFER_BIT);
glUniform1i(time_uniform, (GLint)(before - beginning));
glDrawArraysInstanced(GL_POINTS, 0, amount_of_data, circle_count*point_count);
glFlush();
}
void Resize(const int width, const int height) {
glViewport(0, 0, width, height);
}
private:
uint64_t beginning;
GLint time_uniform;
const int point_count;
const int circle_count;
size_t amount_of_data;
GLuint vertex_array;
GLuint vertex_position_buffer;
GLuint program;
};

class GLXStuff {
public:
GLXStuff(const int width, const int height) {
/*
Step 1: Get a connection to the X server
Step 2: Get a FBConfig
Step 3: Create a versioned context
Step 4: Create a drawable
Step 5: Make the context current on the drawable
*/
int fbcount;
Bool success;
// Step 1
display = XOpenDisplay(NULL);
XSetIOErrorHandler(handler);
int screen = DefaultScreen(display);
Window root_window = RootWindow(display, screen);
assert(display);
printf("%d screen(s)\n", ScreenCount(display));
// Step 2
const int visual_attribs[] = {
GLX_DOUBLEBUFFER, True,
GLX_RED_SIZE, 8,
GLX_GREEN_SIZE, 8,
GLX_BLUE_SIZE, 8,
GLX_ALPHA_SIZE, 8,
GLX_DEPTH_SIZE, 24,
GLX_X_VISUAL_TYPE, GLX_TRUE_COLOR,
//GLX_CONFIG_CAVEAT, GLX_NONE,
//GLX_SAMPLE_BUFFERS, 1,
//GLX_SAMPLES, 16,
None
};
GLXFBConfig* configs = glXChooseFBConfig(display, screen, visual_attribs, &fbcount);
assert(configs);
assert(fbcount);
printf("%d configs\n", fbcount);
// Naively take the first config
GLXFBConfig config = configs[0];
// Step 3
// If you don't want a versioned context, you can use this function and
// GLX will return you ::some:: version of a context. You'll then have to
// use glGet(GL_MAJOR_VERSION) and glGet(GL_MINOR_VERSION) to figure out
// if what it gave us is acceptable.
//context = glXCreateNewContext(display, config, GLX_RGBA_TYPE, NULL, True);
const int context_attribs[] = {
GLX_CONTEXT_MAJOR_VERSION_ARB, 3,
GLX_CONTEXT_MINOR_VERSION_ARB, 2,
//GLX_CONTEXT_MAJOR_VERSION_ARB, 4,
//GLX_CONTEXT_MINOR_VERSION_ARB, 3,
None
};
context = glXCreateContextAttribsARB(display, config, NULL, True, context_attribs);
assert(context);
XFree(configs);
// Step 4
XVisualInfo *visual_info = glXGetVisualFromFBConfig(display, config);
assert(visual_info);
XSetWindowAttributes swa;
Colormap cmap = XCreateColormap(display, root_window, visual_info->visual, AllocNone);
swa.colormap = cmap;
swa.background_pixmap = None;
swa.border_pixel = BlackPixel(display, screen);
swa.event_mask = StructureNotifyMask;
window = XCreateWindow(display, RootWindow(display, screen), 0, 0, width, height, 50, visual_info->depth, InputOutput, visual_info->visual, CWBorderPixel|CWColormap|CWEventMask, &swa);
assert(window);
XMapWindow(display, window);
// Step 5
success = glXMakeCurrent(display, window, context);
assert(success);
}

~GLXStuff() {
XDestroyWindow(display, window);
glXDestroyContext(display, context);
XCloseDisplay(display);
}
void SwapBuffers() {
glXSwapBuffers(display, window);
}
void HandleEvents(OpenGLStuff &opengl_stuff) {
while (XPending(display)) {
XEvent event;
XNextEvent(display, &event);
switch (event.type) {
case ConfigureNotify:
opengl_stuff.Resize(event.xconfigure.width, event.xconfigure.height);
break;
case ClientMessage:
done = 1;
break;
}
}
}
private:
Display* display;
Window window;
GLXContext context;
};

/**
 * Program entry point: creates the window/context and the renderer, then
 * alternates between event handling and drawing until `done` is set.
 */
int main(int argc, char *argv[]) {
  const int width = 1300;
  const int height = 1300;

  GLXStuff glx_stuff(width, height);
  OpenGLStuff opengl_stuff(width, height);
  printf("Done!\n");

  // Render loop. The author's note: 60 FPS, because glXSwapBuffers is
  // synchronous (NOTE(review): actual pacing depends on the driver — confirm).
  for (;;) {
    if (done) {
      break;
    }
    glx_stuff.HandleEvents(opengl_stuff);
    // Event handling may have requested shutdown; skip the final frame.
    if (done) {
      break;
    }
    opengl_stuff.DrawScene();
    glx_stuff.SwapBuffers();
  }
  return 0;
}

Thursday, February 21, 2013

GLSL with medium precision on large screens

A pixel shader is a function that is evaluated over every pixel that a piece of geometry covers. It outputs the color that that pixel should be colored. Let's consider someone drawing a single textured quad that is the size of the entire screen, using mediump precision (more on this later). In a pixel shader, you can ask a sampler what color a texture is at a given location, using the texture2D() GLSL function, and it will return the color. Let's consider a pixel shader that just outputs whatever this function returns.

When writing GLSL shaders, you have the option to specify the default precision for floating point math. There are three options: lowp, mediump, and highp.

The OpenGL ES Shading language spec version 1.0.17 says that mediump precision requires floats to have a relative precision of 2^(-10) (Section 4.5.2). The manual page for glGetShaderPrecisionFormat describes the meaning of that number: It is the size of a ULP in the range between 1 and 2. Because the exponent in the float is constant between 1 and 2 (set at 0), this means that the mantissa of the float has 10 bits in it, and can therefore take on 1024 values.

Samplers, such as the one described above, have to query the texture within the bounds of 0.0 and 1.0, with 0.0 being the top/left of the texture and 1.0 being the bottom/right of the texture. Let's just pay attention to the range between 0.5 and 1.0 (meaning: the bottom half of the texture). Inside the range of 0.5 to 1.0, the exponent in the float is set at (-1), which means that there are only 1024 different representable values within this range, because the mantissa is 10 bits.

However, when drawing a texture over the entire screen, the sampler is invoked on every pixel. Tablets with high resolutions (such as 1080p) are becoming increasingly common. In a 1080p screen, one dimension of the screen may be around 1920 pixels high. If we pay attention to the bottom half of the screen, it is 1920/2=960 pixels high.

Because 960 is close to 1024, there isn't a good mapping from pixels on the bottom half of the screen to texture coordinates between 0.5 and 1.0. What's more, the sampler has to map these 1024 texture coordinates back to the 960 texels in the texture. It's possible that, if you're drawing a fullscreen texture with mediump precision, you'll have a fair amount of repeated texels being outputted in the bottom/right half of your image.

There's actually a little more to the story here. Many drivers recognize these "pass-through" shaders and optimize them into blitting calls from textures to the framebuffer. Because of that, this kind of bug is only visible on sufficiently complicated shaders, so that the driver doesn't recognize that it's just a pass-through.

Thursday, February 14, 2013

I wrote a little cheat-sheet for data transfer in OpenGL. Here it is. Hope you find it useful!

From\To	Texture	Framebuffer	Host Memory
Texture	Bind texture to framebuffer using glFramebufferTexture	glDraw*	If OES, bind texture to framebuffer using glFramebufferTexture. Else, glGetTexImage
Frame-buffer	glCopyTexImage2D	Multiple render targets?	glReadPixels
Host Memory	glTex(Sub)Image2D. Can use glTexStorage2D to allocate if using new GL	Host memory -> Texture -> Framebuffer	memcpy?

Wednesday, February 6, 2013

JavaScriptCore Primitive Encoding

The JavaScriptCore interpreter has a primitive type (called JSValue, inside WebKit/Source/JavaScriptCore/runtime/JSCJSValue.h) which can represent the following types in JavaScript:

Int32
Double
Cell (meaning: a pointer to another data structure)
Boolean
Null, undefined, empty, recently deleted (I'm grouping all these together since they're conceptually similar and have only one valid value each, similar to the "Unit Type")

The size of a JSValue, however, is 64-bits, no matter what type it is holding. It is not a polymorphic type. Instead, it is implemented as a union.

JavaScriptCore runs on 32-bit machines and 64-bit machines. The layout of the type is slightly different for the two architectures. I'll start with 32-bit machines.

The overall bit-layout of a JSValue on a 32 bit machine is as follows:

|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|
|----------------tag----------------|--------------payload--------------|

The format for everything that isn't a double is obvious - assign each type a unique tag, and put its 32-bit payload in bytes 5-8 (since ints and pointers are both 32 bits). But then, how do we encode a double, since a double alone takes up the whole 64-bits? JavaScriptCore is tricky about this - it assigns the tags starting at 0xFFFFFFFF and counts down. The reason for this is that a double encodes "QNaN" if bits 1-13 are set (counting from the left). Therefore, if any of the tags are set, the bits in the JSValue (when interpreted as a double) would encode a NaN. There are also more NaN values available, so it's possible to encode an ::actual:: double NaN. So, to tell what type a JSValue is, just see if the tag matches, and if it doesn't, the value is a double.

----------------

Now, for a 64-bit machine. The types are all the same size, except that now pointers are 64 bits (ints are still 32 bits even on a 64-bit machine). So, how can we tag the pointer, even though it takes up the whole 64-bits? Well, JavaScriptCore makes an observation that in most pointers, the top two bytes are usually 0 (Yes, it is true that addresses are arbitrary due to virtual memory, but usually machines don't get that high when allocating addresses. To get up that high in the first place would require this process claiming 281 terabytes of memory). We can, then, recognize a pointer as having its top two bytes == 0. So then, how do we distinguish a pointer whose top two bytes are 0 from a double whose top two bytes are 0? Well, we can artificially increment the top two bytes of the double. So, to encode a double, we'd first reinterpret the bits as if they were an int64, then add 0x0001000000000000. To get back the original double, just subtract that value and reinterpret the bits as a double. This won't overflow because the only values that would overflow have the leftmost 16 bits set, but that's a NaN. The same argument can be used to show that the leftmost two bytes won't ever end up being 0xFFFF either.

|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|
|00000000000000000-----------pointer value------------------------------|
| 0x0001 - 0xFFFE ------------double value------------------------------|

Well, what about the other types that a JSValue can represent? Our old tagging mechanism (used on 32-bit processors) won't work because our tag has now been shortened to only 16 bits, but in order to encode a NaN, the first 13 have to be set. This means that we only have 3 bits to play with, which isn't enough (we might want to add more than 8 extra types). But, never fear! Just like there are invalid double values, there are also invalid pointer values! Namely, pointer values that would map to the first page of memory (which is usually mapped with no permissions, so these addresses are invalid). Therefore, we can use the rightmost few bits as a tag. All the types except for Int don't actually encode much (don't have much entropy), so JavaScriptCore uses combinations of 0x2, 0x4, and 0x8 to create the necessary tags. However, for ints, it actually does use one of the NaN values (using a leftmost 0xFFFF tag) which encodes int32s, so the rightmost 4 bytes can be the payload. This would be faster than putting the int32 in the middle of the 64-bit value (something like making bytes 3-6 specify the payload) because you wouldn't need extra shifting operations to recreate the int.

|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|
|FFFFFFFFFFFFFFFFF|00000000000000000|-----------integer value-----------|
|00000000000000000|-------------------------------------------------TAG-|