33 #include <libFreeWRL.h>
36 #include "RenderFuncs.h"
37 #include "../vrml_parser/Structs.h"
39 #include "../main/headers.h"
41 #include "LinearAlgebra.h"
42 #include "Collision.h"
44 #if !defined(_ANDROID)
45 #include "../opencl/OpenCL_Utils.h"
49 static const char* collide_non_walk_kernel;
50 static const char* collide_non_walk_kernel_headers;
52 #define FLOAT_TOLERANCE 0.00000001
65 cl_platform_id cpPlatform = NULL;
66 cl_device_id* cdDevices = NULL;
67 cl_uint uiTargetDevice = 0;
83 bool bGLinteropSupported =
false;
84 cl_uint uiNumDevsUsed = 1;
85 bool bGLinterop =
false;
118 #define shrLog printf
124 bool collision_initGPUCollide (
struct sCollisionGPU* initme) {
127 p = (ppOpenCL_Utils)tg->OpenCL_Utils.prv;
131 size_t kernel_wg_size;
138 kp[0] = (
char *)collide_non_walk_kernel_headers;
139 kp[1] = (
char *)collide_non_walk_kernel;
141 initme->CollideGPU_program = clCreateProgramWithSource(p->CL_context, 2, (
const char **) kp, NULL, &err);
144 if (!initme->CollideGPU_program || (err != CL_SUCCESS)) {
145 printCLError(
"clCreateProgramWithSource",err);
157 err = clBuildProgram(initme->CollideGPU_program, 1, &(p->CL_device_id), opts, NULL, NULL);
161 if (err != CL_SUCCESS) {
165 ConsoleMessage(
"Error: Failed to build program executable\n");
166 printCLError(
"clBuildProgram",err);
167 err = clGetProgramBuildInfo(initme->CollideGPU_program, p->CL_device_id, CL_PROGRAM_BUILD_LOG,
168 sizeof(buffer), buffer, &len);
169 TEST_ERR(
"clGetProgramBuildInfo",err);
170 ConsoleMessage (
"error string len %d\n",(
int)len);
171 ConsoleMessage(
"%s\n", buffer);
176 initme->CollideGPU_kernel = clCreateKernel(initme->CollideGPU_program,
"compute_collide", &err);
179 if (!initme->CollideGPU_kernel || (err != CL_SUCCESS)) {
180 printCLError(
"clCreateKernel",err);
187 err = clGetKernelWorkGroupInfo (initme->CollideGPU_kernel, p->CL_device_id,
188 CL_KERNEL_WORK_GROUP_SIZE,
sizeof(
size_t), &kernel_wg_size, &rvlen);
190 if (err!=CL_SUCCESS) {
191 printCLError(
"clGetKernelWorkGroupInfo",err);
196 if (kernel_wg_size < p->CL_default_workgroup_size) initme->CollideGPU_workgroup_size = kernel_wg_size;
197 else initme->CollideGPU_workgroup_size = p->CL_default_workgroup_size;
200 ConsoleMessage (
"MAX_WORK_GROUP_SIZE %d\n",kernel_wg_size);
201 ConsoleMessage (
"We are going to set our workgroup size to %d\n",wg_size);
221 int printedOnce = TRUE;
224 struct point_XYZ run_non_walk_collide_program(GLuint vertex_vbo, GLuint index_vbo, float *modelMat,
int ntri,
225 int face_ccw,
int face_flags,
float avatar_radius) {
229 size_t local_work_size;
230 size_t global_work_size;
233 double maxdisp = 0.0;
234 struct point_XYZ dispv, maxdispv = {0,0,0};
236 struct sCollisionGPU* me = GPUCollisionInfo();
240 p = (ppOpenCL_Utils)tg->OpenCL_Utils.prv;
243 if (me->CollideGPU_returnValues.n < ntri) {
244 if (me->CollideGPU_returnValues.n != 0) {
245 err = clReleaseMemObject(me->CollideGPU_output_buffer);
246 TEST_ERR(
"clReleaseMemObject",err);
249 me->CollideGPU_output_buffer = clCreateBuffer(p->CL_context, CL_MEM_WRITE_ONLY,
sizeof(
struct SFColorRGBA) * ntri,
252 if (me->CollideGPU_matrix_buffer == NULL) {
253 me->CollideGPU_matrix_buffer = clCreateBuffer(p->CL_context, CL_MEM_READ_ONLY, sizeof (cl_float16), NULL, NULL);
256 if (!(me->CollideGPU_output_buffer) || !(me->CollideGPU_matrix_buffer)) {
257 printCLError(
"clCreateBuffer",10000);
260 me->CollideGPU_output_size = ntri;
261 me->CollideGPU_returnValues.p = REALLOC(me->CollideGPU_returnValues.p,
sizeof(
struct SFColorRGBA) *ntri);
262 me->CollideGPU_returnValues.n = ntri;
266 err = clEnqueueWriteBuffer(p->CL_queue, me->CollideGPU_matrix_buffer, CL_TRUE, 0,
sizeof(cl_float16), modelMat, 0, NULL, NULL);
267 TEST_ERR(
"clEnqueueWriteBuffer",err);
270 me->CollideGPU_vertex_buffer=clCreateFromGLBuffer(p->CL_context, CL_MEM_READ_ONLY, vertex_vbo, &err);
271 if (err != CL_SUCCESS) {
272 printCLError(
"clCreateFromGLBuffer",err);
277 me->CollideGPU_index_buffer = clCreateFromGLBuffer(p->CL_context, CL_MEM_READ_ONLY, index_vbo, &err);
278 if (err != CL_SUCCESS) {
279 printCLError(
"clCreateFromGLBuffer",err);
284 count = (
unsigned int) ntri;
286 err = clSetKernelArg(me->CollideGPU_kernel, 0,
sizeof(cl_mem), &me->CollideGPU_output_buffer);
287 TEST_ERR(
"clSetKernelArg",err);
289 err =clSetKernelArg(me->CollideGPU_kernel, 1,
sizeof(
unsigned int), &count);
290 TEST_ERR(
"clSetKernelArg",err);
292 err =clSetKernelArg(me->CollideGPU_kernel, 2, sizeof (cl_mem), &me->CollideGPU_matrix_buffer);
293 TEST_ERR(
"clSetKernelArg",err);
295 err =clSetKernelArg(me->CollideGPU_kernel, 3, sizeof (cl_mem), &me->CollideGPU_vertex_buffer);
296 TEST_ERR(
"clSetKernelArg",err);
298 err =clSetKernelArg(me->CollideGPU_kernel, 4, sizeof (cl_mem), &me->CollideGPU_index_buffer);
299 TEST_ERR(
"clSetKernelArg",err);
301 err =clSetKernelArg(me->CollideGPU_kernel, 5,
sizeof(
int), &face_ccw);
302 TEST_ERR(
"clSetKernelArg",err);
304 err =clSetKernelArg(me->CollideGPU_kernel, 6,
sizeof(
int), &face_flags);
305 TEST_ERR(
"clSetKernelArg",err);
307 err =clSetKernelArg(me->CollideGPU_kernel, 7,
sizeof(
int), &avatar_radius);
308 TEST_ERR(
"clSetKernelArg",err);
310 err =clSetKernelArg(me->CollideGPU_kernel, 8,
sizeof(
int), &ntri);
311 TEST_ERR(
"clSetKernelArg",err);
314 #define MYWG (me->CollideGPU_workgroup_size)
317 global_work_size = (size_t) (ntri) / MYWG;
318 else global_work_size = 0;
321 global_work_size += 1;
324 global_work_size *= MYWG;
328 local_work_size = MYWG;
333 cl_context myContext;
334 cl_device_id myDevice;
338 err = clGetCommandQueueInfo(p->CL_queue, CL_QUEUE_CONTEXT,
sizeof(myContext), &myContext, NULL);
339 printf (
"queue context when commandqueue created %p, should be %p\n", myContext,p->CL_context);
340 err = clGetCommandQueueInfo(p->CL_queue, CL_QUEUE_DEVICE,
sizeof(myDevice), &myDevice, NULL);
341 printf (
"queue Device when commandqueue created %p, should be %p\n", myDevice,p->CL_device_id);
342 err = clGetCommandQueueInfo(p->CL_queue, CL_QUEUE_REFERENCE_COUNT,
sizeof(myReference), &myReference, NULL);
343 printf (
"queue Reference when commandqueue created %d\n", myReference);
347 err = clEnqueueNDRangeKernel(p->CL_queue, me->CollideGPU_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
349 if (err != CL_SUCCESS) {
350 printCLError(
"clEnqueueNDRangeKernel",err);
358 err = clFlush(p->CL_queue);
359 if (err != CL_SUCCESS) {
360 printCLError(
"clFlush",err);
364 err = clFinish(p->CL_queue);
365 if (err != CL_SUCCESS) {
366 printCLError(
"clFinish",err);
381 err = clEnqueueReadBuffer (p->CL_queue, me->CollideGPU_output_buffer,
383 me->CollideGPU_returnValues.p, 0, NULL, NULL);
385 if (err != CL_SUCCESS) {
386 printCLError(
"clEnqueueReadBuffer",err);
391 for (i=0; i < ntri; i++) {
401 if (me->CollideGPU_returnValues.p[i].c[3] > 1.0) {
405 dispv.x = me->CollideGPU_returnValues.p[i].c[0];
406 dispv.y = me->CollideGPU_returnValues.p[i].c[1];
407 dispv.z = me->CollideGPU_returnValues.p[i].c[2];
415 disp = vecdot (&dispv,&dispv);
416 if ((disp > FLOAT_TOLERANCE) && (disp>maxdisp)) {
430 #ifdef GL_ES_VERSION_2_0
431 static const char* collide_non_walk_kernel_headers =
" \
432 //#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n\
433 #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable \n\
434 //#pragma OPENCL EXTENSION CL_APPLE_gl_sharing : enable \n\
435 //#pragma OPENCL EXTENSION CL_KHR_gl_sharing : enable \n\
436 //#pragma OPENCL EXTENSION cl_khr_select_fprounding_mode : enable \n\
451 static const char* collide_non_walk_kernel_headers =
" \
452 #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable \n\
460 static const char* collide_non_walk_kernel =
" \
462 /********************************************************************************/ \n\
464 /* Collide kernel for fly and examine modes */ \n\
466 /********************************************************************************/ \n\
467 /* Function prototypes */ \n\
468 float4 closest_point_on_plane(float4 point_a, float4 point_b, float4 point_c); \n\
470 /* start the collide process. \n\
472 1) transform the vertex. \n\
473 2) calculate normal \n\
474 3) if triangle is visible to us, get ready for collide calcs \n\
479 #define DOUGS_FLOAT_TOLERANCE 0.00000001 \n\
480 #define FLOAT_TOLERANCE 0.0000001f \n\
481 #define PR_DOUBLESIDED 0x01 \n\
482 #define PR_FRONTFACING 0x02 /* overrides effect of doublesided. */ \n\
483 #define PR_BACKFACING 0x04 /* overrides effect of doublesided, all normals are reversed. */ \n\
485 /********************************************************************************/ \n\
488 #define APPROX (a, b) (fabs(a-b) < FLOAT_TOLERANCE) \n\
489 #define VECSCALE(v,s) (float4)(v.x*s, v.y*s, v.z*s, 0.0f) \n\
490 #define VECLENGTH(v) (float)sqrt((float)dot((float4)v,(float4)v)) \n\
494 /********************************************************************************/ \n\
496 /* Three vertices; find the closest one which intersects the Z plane; */ \n\
497 /* either we choose a Vertex, on an edge, or fabricate one in the */ \n\
498 /* middle of the triangle somewhere. */ \n\
500 /* Adapted from \"Real time Collision Detection\", Christer Ericson. */ \n\
502 /********************************************************************************/ \n\
505 float4 closest_point_on_plane(float4 point_a, float4 point_b, float4 point_c) { \n\
506 float4 vector_ab = (point_b - point_a); // b - a \n\
507 float4 vector_ac = (point_c - point_a); // c - a \n\
508 float4 vector_bc = (point_c - point_b); // c - b \n\
509 float4 vector_ba = (point_a - point_b); // a - b \n\
510 float4 vector_ca = (point_a - point_c); // a - c \n\
511 float4 vector_cb = (point_b - point_c); // b - c \n\
514 // we have moved points, so our bounding sphere is at (0,0,0) so p = (0,0,0) \n\
515 float4 vector_ap = point_a * (float4)(-1.0f, -1.0f, -1.0f, -1.0f); // p - a \n\
516 float4 vector_bp = point_b * (float4)(-1.0f, -1.0f, -1.0f, -1.0f); // p - b \n\
517 float4 vector_cp = point_c * (float4)(-1.0f, -1.0f, -1.0f, -1.0f); // p - c \n\
518 #define vector_pa point_a /* a - p */ \n\
519 #define vector_pb point_b /* b - p */ \n\
520 #define vector_pc point_c /* c - p */ \n\
522 // Step 2. Compute parametric position s for projection P' of P on AB, \n\
523 // P' = A + s*AB, s = snom/(snom+sdenom) \n\
525 float snom = dot(vector_ap, vector_ab); // (p - a, ab); \n\
526 float sdenom = dot(vector_bp, vector_ba); // (p - b, a - b); \n\
529 // Compute parametric position t for projection P' of P on AC, \n\
530 // P' = A + t*AC, s = tnom/(tnom+tdenom) \n\
531 float tnom = dot(vector_ap, vector_ac); // (p - a, ac); \n\
532 float tdenom = dot(vector_cp, vector_ca); // (p - c, a - c); \n\
535 if (snom <= 0.0f && tnom <= 0.0f) { \n\
540 // Compute parametric position u for projection P' of P on BC, \n\
541 // P' = B + u*BC, u = unom/(unom+udenom) \n\
542 float unom = dot(vector_bp, vector_bc); //(p - b, bc) \n\
543 float udenom = dot(vector_cp, vector_cb); // (p - c, b - c); \n\
546 if (sdenom <= 0.0f && unom <= 0.0f) { \n\
550 if (tdenom <= 0.0f && udenom <= 0.0f) { \n\
556 // P is outside (or on) AB if the triple scalar product [N PA PB] <= 0 \n\
561 n = cross(vector_ab, vector_ac); // (b - a, c - a); \n\
562 tmp = cross(vector_pa, vector_pb); // veccross (a-p, b-p); \n\
564 // vc = dot(n, veccross(a - p, b - p)); \n\
565 vc = dot(n, tmp); \n\
568 // If P outside AB and within feature region of AB, \n\
569 // return projection of P onto AB \n\
570 if (vc <= 0.0f && snom >= 0.0f && sdenom >= 0.0f) { \n\
571 return point_a + snom / (snom + sdenom) * vector_ab; \n\
577 // P is outside (or on) BC if the triple scalar product [N PB PC] <= 0 \n\
578 tmp = cross (vector_pb, vector_pc); \n\
580 float va = dot(n, tmp); // Cross(b - p, c - p)); \n\
582 // If P outside BC and within feature region of BC, \n\
583 // return projection of P onto BC \n\
584 if (va <= 0.0f && unom >= 0.0f && udenom >= 0.0f) { \n\
585 return point_b + unom / (unom + udenom) * vector_bc; \n\
589 // P is outside (or on) CA if the triple scalar product [N PC PA] <= 0 \n\
590 tmp = cross (vector_pc, vector_pa); \n\
592 float vb = dot(n, tmp); // Cross(c - p, a - p)); \n\
593 // If P outside CA and within feature region of CA, \n\
594 // return projection of P onto CA \n\
595 if (vb <= 0.0f && tnom >= 0.0f && tdenom >= 0.0f) { \n\
596 return point_a + tnom / (tnom + tdenom) * vector_ac; \n\
600 // P must project inside face region. Compute Q using barycentric coordinates \n\
601 float u = va / (va + vb + vc); \n\
602 float v = vb / (va + vb + vc); \n\
603 float w = 1.0f - u - v; // = vc / (va + vb + vc) \n\
604 float4 u4 = (float4)(u); \n\
605 float4 v4 = (float4)(v); \n\
606 float4 w4 = (float4)(w); \n\
608 //return u * point_a + v * point_b + w * point_c; \n\
609 float4 rv = mad(point_a,u4,mad(point_b,v4,point_c*w4)); \n\
613 /********************************************************************************/ \n\
615 __kernel void compute_collide ( \n\
616 __global float4 *output, /* 0 */ \n\
617 const unsigned int count, /* 1 */ \n\
618 __global float *mymat, /* 2 */ \n\
619 __global float *my_vertex, /* 3 */ \n\
620 __global short *my_cindex, /* 4 */ \n\
621 const int face_ccw, /* 5 */ \n\
622 const int face_flags, /* 6 */ \n\
623 const float avatar_radius, /* 7 */ \n\
624 const int ntri /* 8 */ \n\
627 /* which index this instantation is working on */ \n\
628 int i_am_canadian = get_global_id(0); \n\
629 if (i_am_canadian >= ntri) return; /* allows for workgroup size sizes */ \n\
631 /* vertices for this triangle */ \n\
632 /* transformed by matrix */ \n\
637 /* starting index in my_vertex of this vertex */ \n\
638 /* we work in triangles; each triangle has 3 vertices */ \n\
639 #define COORD_1 (my_cindex[i_am_canadian*3+0]*3) \n\
640 #define COORD_2 (my_cindex[i_am_canadian*3+1]*3) \n\
641 #define COORD_3 (my_cindex[i_am_canadian*3+2]*3) \n\
643 /* do matrix transform, 4 floats wide. */ \n\
644 float4 matColumn1 = (float4)(convert_float(mymat[0]),convert_float(mymat[1]),convert_float(mymat[2]),0.0f); \n\
645 float4 matColumn2 = (float4)(convert_float(mymat[4]),convert_float(mymat[5]),convert_float(mymat[6]),0.0f); \n\
646 float4 matColumn3 = (float4)(convert_float(mymat[8]),convert_float(mymat[9]),convert_float(mymat[10]),0.0f); \n\
647 float4 matColumn4 = (float4)(convert_float(mymat[12]),convert_float(mymat[13]),convert_float(mymat[14]),0.0f); \n\
649 /* first vertex */ \n\
650 float4 Vertex_X = (float4)(my_vertex[COORD_1+0]); \n\
651 float4 Vertex_Y = (float4)(my_vertex[COORD_1+1]); \n\
652 float4 Vertex_Z = (float4)(my_vertex[COORD_1+2]); \n\
653 tv1 = mad(matColumn1,Vertex_X,mad(matColumn2,Vertex_Y,mad(matColumn3,Vertex_Z,matColumn4))); \n\
655 /* second vertex */ \n\
656 Vertex_X = (float4)(my_vertex[COORD_2+0]); \n\
657 Vertex_Y = (float4)(my_vertex[COORD_2+1]); \n\
658 Vertex_Z = (float4)(my_vertex[COORD_2+2]); \n\
659 tv2 = mad(matColumn1,Vertex_X,mad(matColumn2,Vertex_Y,mad(matColumn3,Vertex_Z,matColumn4))); \n\
661 /* third vertex */ \n\
662 Vertex_X = (float4)(my_vertex[COORD_3+0]); \n\
663 Vertex_Y = (float4)(my_vertex[COORD_3+1]); \n\
664 Vertex_Z = (float4)(my_vertex[COORD_3+2]); \n\
665 tv3 = mad(matColumn1,Vertex_X,mad(matColumn2,Vertex_Y,mad(matColumn3,Vertex_Z,matColumn4))); \n\
668 /* calculate normal for face from transformed vertices */ \n\
669 /* this replicates polynormalf for opencl */ \n\
670 #define VEC_DIST_1 (tv2-tv1) \n\
671 #define VEC_DIST_2 (tv3-tv1) \n\
672 float4 norm = normalize(cross(VEC_DIST_1,VEC_DIST_2)); \n\
674 /* from polyrep_disp_rec2, see that function for full comments */ \n\
675 bool frontfacing; \n\
677 /* how we view it from the avatar */ \n\
678 if (face_ccw) frontfacing = (dot(norm,tv1) < 0); \n\
679 else frontfacing = (dot(norm,tv1) >= 0); \n\
681 /* now, is solid false, or ccw or ccw winded triangle? */ \n\
682 /* if we should do this triangle, the if statement is true */ \n\
684 bool should_do_this_triangle = \n\
685 ((frontfacing && !(face_flags & PR_DOUBLESIDED) ) \n\
686 || ( (face_flags & PR_DOUBLESIDED) && !(face_flags & (PR_FRONTFACING | PR_BACKFACING) ) ) \n\
687 || (frontfacing && (face_flags & PR_FRONTFACING)) \n\
688 || (!frontfacing && (face_flags & PR_BACKFACING)) ); \n\
691 if (!should_do_this_triangle) { \n\
692 output[i_am_canadian] = (float4)(0.0f,0.0f,0.0f,0.0f); \n\
698 /* if we are down to here, we have to do this triangle */ \n\
700 if(!frontfacing) { /*can only be here in DoubleSided mode*/ \n\
701 /*reverse polygon orientation, and do calculations*/ \n\
702 norm = VECSCALE(norm,-1.0f); \n\
705 /********************************************************************************/ \n\
707 /* Collide Kernel Step 2: do hit calculations */ \n\
708 /* replicate Dougs get_poly_min_disp_with_sphere function */ \n\
710 /********************************************************************************/ \n\
712 float4 closest_point = closest_point_on_plane(tv1,tv2,tv3); \n\
714 float get_poly_mindisp = dot(closest_point,closest_point); \n\
716 if (get_poly_mindisp > (avatar_radius * avatar_radius)) { \n\
717 output[i_am_canadian] = (float4)(0.0f,0.0f,0.0f,0.0f); \n\
721 /* do we have a movement here? */ \n\
722 if (VECLENGTH(closest_point) > FLOAT_TOLERANCE) { \n\
723 float poly_min_rt = sqrt(get_poly_mindisp); \n\
724 float sFactor = (avatar_radius -poly_min_rt) /VECLENGTH(closest_point); \n\
726 float4 result = VECSCALE(closest_point,sFactor); \n\
727 /* copy over the result */ \n\
728 result.w = 100.0f; /* flag that this is a good one */ \n\
729 output[i_am_canadian] = result; \n\
734 /* if we are down to here, we can just return zero */ \n\
735 output[i_am_canadian] = (float4)(0.0f,0.0f,0.0f,0.0f); \n\