FreeWRL/FreeX3D  3.0.0
CollisionGPU.c
1 /*
2 
3 
4 Render the children of nodes.
5 
6 */
7 
8 /****************************************************************************
9  This file is part of the FreeWRL/FreeX3D Distribution.
10 
11  Copyright 2009 CRC Canada. (http://www.crc.gc.ca)
12 
13  FreeWRL/FreeX3D is free software: you can redistribute it and/or modify
14  it under the terms of the GNU Lesser Public License as published by
15  the Free Software Foundation, either version 3 of the License, or
16  (at your option) any later version.
17 
18  FreeWRL/FreeX3D is distributed in the hope that it will be useful,
19  but WITHOUT ANY WARRANTY; without even the implied warranty of
20  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21  GNU General Public License for more details.
22 
23  You should have received a copy of the GNU General Public License
24  along with FreeWRL/FreeX3D. If not, see <http://www.gnu.org/licenses/>.
25 ****************************************************************************/
26 
27 
28 #include <config.h>
29 #include <system.h>
30 #include <display.h>
31 #include <internal.h>
32 
33 #include <libFreeWRL.h>
34 
35 #include "Viewer.h"
36 #include "RenderFuncs.h"
37 #include "../vrml_parser/Structs.h"
38 
39 #include "../main/headers.h"
40 
41 #include "LinearAlgebra.h"
42 #include "Collision.h"
43 // OLD_IPHONE_AQUA #if !defined(_ANDROID) && !defined(IPHONE)
44 #if !defined(_ANDROID)
45 #include "../opencl/OpenCL_Utils.h"
46 #endif
47 #ifdef HAVE_OPENCL
48 
49 static const char* collide_non_walk_kernel;
50 static const char* collide_non_walk_kernel_headers;
51 
52 #define FLOAT_TOLERANCE 0.00000001
53 
54 /********************************************************************************/
55 /* */
56 /* Collide kernel, generic structures, etc */
57 /* */
58 /********************************************************************************/
59 
60 /********************************************************************************/
61 /* */
62 /* */
63 /********************************************************************************/
64 #ifdef _MSC_VER
65 cl_platform_id cpPlatform = NULL; // OpenCL platform
66 cl_device_id* cdDevices = NULL; // device list
67 cl_uint uiTargetDevice = 0; // Default Device to compute on
68 
69 
70 
71 cl_int ciErrNum; // Error code var
72 enum LOGMODES
73 {
74  LOGCONSOLE = 1, // bit to signal "log to console"
75  LOGFILE = 2, // bit to signal "log to file"
76  LOGBOTH = 3, // convenience union of first 2 bits to signal "log to both"
77  APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open
78  MASTER = 8, // bit to signal master .csv log output
79  ERRORMSG = 16, // bit to signal "pre-pend Error"
80  CLOSELOG = 32 // bit to close log file, if open, after any requested file write
81 };
82 int bQATest = 0; // false = normal GL loop, true = run No-GL test sequence
83 bool bGLinteropSupported = false; // state var for GL interop supported or not
84 cl_uint uiNumDevsUsed = 1; // Number of devices used in this sample
85 bool bGLinterop = false; // state var for GL interop or not
86 /*
87 void Cleanup(int iExitCode)
88 {
89  // Cleanup allocated objects
90  //shrLog("\nStarting Cleanup...\n\n");
91  if(program)clReleaseProgram(program);
92  if(context)clReleaseContext(context);
93  if(cdDevices)free(cdDevices);
94 
95  // Cleanup GL objects if used
96  if (!bQATest)
97  {
98  //DeInitGL();
99  }
100 
101  // finalize logs and leave
102  //shrLog("%s\n\n", iExitCode == 0 ? "PASSED" : "FAILED");
103  if ((bQATest))
104  {
105  // shrLogEx(LOGBOTH | CLOSELOG, 0, "oclBoxFilter.exe Exiting...\n");
106  }
107  else
108  {
109  //shrLogEx(LOGBOTH | CLOSELOG, 0, "oclBoxFilter.exe Exiting...\nPress <Enter> to Quit\n");
110  #ifdef WIN32
111  getchar();
112  #endif
113  }
114  exit (iExitCode);
115 }
116 void (*pCleanup)(int) = &Cleanup;
117 */
118 #define shrLog printf
119 #endif //_MSC_VER
120 
121 
122 
123 
124 bool collision_initGPUCollide (struct sCollisionGPU* initme) {
125  ppOpenCL_Utils p;
126  ttglobal tg = gglobal();
127  p = (ppOpenCL_Utils)tg->OpenCL_Utils.prv;
128 
129  char *kp[2];
130  cl_int err;
131  size_t kernel_wg_size;
132  size_t rvlen;
133 
134  //printf ("called initGPUCollide, p is %p\n",p);
135  //printf ("called initGPUCollide, context is %p\n",p->CL_context);
136  //printf ("called initGPUCollide, initme is %p\n",initme);
137 
138  kp[0] = (char *)collide_non_walk_kernel_headers;
139  kp[1] = (char *)collide_non_walk_kernel;
140 
141  initme->CollideGPU_program = clCreateProgramWithSource(p->CL_context, 2, (const char **) kp, NULL, &err);
142  //printf ("past the clCreateProgramWithSource call\n");
143 
144  if (!initme->CollideGPU_program || (err != CL_SUCCESS)) {
145  printCLError("clCreateProgramWithSource",err);
146  return FALSE;
147  }
148 
149  // build the compute program executable
150  //char *opts = "-Werror -cl-single-precision-constant -cl-nv-verbose -g -cl-opt-disable -cl-strict-aliasing";
151  //char *opts = "-Werror -cl-single-precision-constant -cl-opt-disable -cl-strict-aliasing";
152  //err = clBuildProgram(p->program, 0, NULL, opts, NULL, NULL);
153  //ConsoleMessage ("calling clBuildProgram with program %p\n",p->program);
154 
155  // build the program, hard code in devices to 1 device, with the device list, no options
156  char *opts = NULL;
157  err = clBuildProgram(initme->CollideGPU_program, 1, &(p->CL_device_id), opts, NULL, NULL);
158  //printf ("past the clBuildProgram call\n");
159 
160  //ConsoleMessage ("called clBuildProgram error %d\n",err);
161  if (err != CL_SUCCESS) {
162  size_t len;
163  char buffer[16384];
164 
165  ConsoleMessage("Error: Failed to build program executable\n");
166  printCLError("clBuildProgram",err);
167  err = clGetProgramBuildInfo(initme->CollideGPU_program, p->CL_device_id, CL_PROGRAM_BUILD_LOG,
168  sizeof(buffer), buffer, &len);
169  TEST_ERR("clGetProgramBuildInfo",err);
170  ConsoleMessage ("error string len %d\n",(int)len);
171  ConsoleMessage("%s\n", buffer);
172  return FALSE;
173  }
174 
175  // create the compute kernel
176  initme->CollideGPU_kernel = clCreateKernel(initme->CollideGPU_program, "compute_collide", &err);
177  // printf ("kernel is %p\n",initme->CollideGPU_kernel);
178 
179  if (!initme->CollideGPU_kernel || (err != CL_SUCCESS)) {
180  printCLError("clCreateKernel",err);
181  return FALSE;
182  }
183 
184 
185  // Kernel Workgroup size
186  // rv = clGetDeviceInfo (p->device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, &rvlen);
187  err = clGetKernelWorkGroupInfo (initme->CollideGPU_kernel, p->CL_device_id,
188  CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernel_wg_size, &rvlen);
189 
190  if (err!=CL_SUCCESS) {
191  printCLError( "clGetKernelWorkGroupInfo",err);
192  return FALSE;
193  }
194 
195  // try the smaller of the two
196  if (kernel_wg_size < p->CL_default_workgroup_size) initme->CollideGPU_workgroup_size = kernel_wg_size;
197  else initme->CollideGPU_workgroup_size = p->CL_default_workgroup_size;
198 
199  #ifdef GPU_DEBUG
200  ConsoleMessage ("MAX_WORK_GROUP_SIZE %d\n",kernel_wg_size);
201  ConsoleMessage ("We are going to set our workgroup size to %d\n",wg_size);
202 
203 
204 /*
205 1. Get workGroupSize from clGetDeviceInfo with CL_DEVICE_mum of two values and use that value as your optimal workGroupSize
206 2. Get KernelWorkGroupSize from from clGetKernelWorkGroupInfo with CL_KERNEL_WORK_GPOUP_SIZE
207 3. Get minimum of two values and use that value as your optimal workGroupSize
208 */
209 
210  #endif // GPU_DEBUG
211 
212  return TRUE;
213 }
214 
215 
216 /********************************************************************************/
217 /* */
218 /* */
219 /********************************************************************************/
220 
221 int printedOnce = TRUE;
222 
223 
224 struct point_XYZ run_non_walk_collide_program(GLuint vertex_vbo, GLuint index_vbo, float *modelMat,int ntri,
225  int face_ccw, int face_flags, float avatar_radius) {
226 
227  int i;
228  cl_int err;
229  size_t local_work_size;
230  size_t global_work_size;
231  unsigned int count;
232 
233  double maxdisp = 0.0;
234  struct point_XYZ dispv, maxdispv = {0,0,0};
235 
236  struct sCollisionGPU* me = GPUCollisionInfo();
237 
238  ppOpenCL_Utils p;
239  ttglobal tg = gglobal();
240  p = (ppOpenCL_Utils)tg->OpenCL_Utils.prv;
241 
242  // enough space for rv?
243  if (me->CollideGPU_returnValues.n < ntri) {
244  if (me->CollideGPU_returnValues.n != 0) {
245  err = clReleaseMemObject(me->CollideGPU_output_buffer);
246  TEST_ERR("clReleaseMemObject",err);
247  }
248 
249  me->CollideGPU_output_buffer = clCreateBuffer(p->CL_context, CL_MEM_WRITE_ONLY, sizeof(struct SFColorRGBA) * ntri,
250  NULL, NULL);
251 
252  if (me->CollideGPU_matrix_buffer == NULL) {
253  me->CollideGPU_matrix_buffer = clCreateBuffer(p->CL_context, CL_MEM_READ_ONLY, sizeof (cl_float16), NULL, NULL);
254  }
255 
256  if (!(me->CollideGPU_output_buffer) || !(me->CollideGPU_matrix_buffer)) {
257  printCLError("clCreateBuffer",10000);
258  }
259 
260  me->CollideGPU_output_size = ntri;
261  me->CollideGPU_returnValues.p = REALLOC(me->CollideGPU_returnValues.p, sizeof(struct SFColorRGBA) *ntri);
262  me->CollideGPU_returnValues.n = ntri;
263  }
264 
265  // update the current matrix transform
266  err = clEnqueueWriteBuffer(p->CL_queue, me->CollideGPU_matrix_buffer, CL_TRUE, 0, sizeof(cl_float16), modelMat, 0, NULL, NULL);
267  TEST_ERR("clEnqueueWriteBuffer",err);
268 
269  // lets get the openGL vertex buffer here
270  me->CollideGPU_vertex_buffer=clCreateFromGLBuffer(p->CL_context, CL_MEM_READ_ONLY, vertex_vbo, &err);
271  if (err != CL_SUCCESS) {
272  printCLError("clCreateFromGLBuffer",err);
273  return maxdispv;
274  }
275 
276  // and the coordinate index buffer
277  me->CollideGPU_index_buffer = clCreateFromGLBuffer(p->CL_context, CL_MEM_READ_ONLY, index_vbo, &err);
278  if (err != CL_SUCCESS) {
279  printCLError("clCreateFromGLBuffer",err);
280  return maxdispv;
281  }
282 
283  // set the args values
284  count = (unsigned int) ntri;
285 
286  err = clSetKernelArg(me->CollideGPU_kernel, 0, sizeof(cl_mem), &me->CollideGPU_output_buffer);
287  TEST_ERR("clSetKernelArg",err);
288 
289  err =clSetKernelArg(me->CollideGPU_kernel, 1, sizeof(unsigned int), &count);
290  TEST_ERR("clSetKernelArg",err);
291 
292  err =clSetKernelArg(me->CollideGPU_kernel, 2, sizeof (cl_mem), &me->CollideGPU_matrix_buffer);
293  TEST_ERR("clSetKernelArg",err);
294 
295  err =clSetKernelArg(me->CollideGPU_kernel, 3, sizeof (cl_mem), &me->CollideGPU_vertex_buffer);
296  TEST_ERR("clSetKernelArg",err);
297 
298  err =clSetKernelArg(me->CollideGPU_kernel, 4, sizeof (cl_mem), &me->CollideGPU_index_buffer);
299  TEST_ERR("clSetKernelArg",err);
300 
301  err =clSetKernelArg(me->CollideGPU_kernel, 5, sizeof(int), &face_ccw);
302  TEST_ERR("clSetKernelArg",err);
303 
304  err =clSetKernelArg(me->CollideGPU_kernel, 6, sizeof(int), &face_flags);
305  TEST_ERR("clSetKernelArg",err);
306 
307  err =clSetKernelArg(me->CollideGPU_kernel, 7, sizeof(int), &avatar_radius);
308  TEST_ERR("clSetKernelArg",err);
309 
310  err =clSetKernelArg(me->CollideGPU_kernel, 8, sizeof(int), &ntri);
311  TEST_ERR("clSetKernelArg",err);
312 
313  // global work group size
314  #define MYWG (me->CollideGPU_workgroup_size)
315  // find out how many "blocks" we can have
316  if (MYWG > 0)
317  global_work_size = (size_t) (ntri) / MYWG;
318  else global_work_size = 0;
319 
320  // add 1 to it, because we have to round up
321  global_work_size += 1;
322 
323  // now, global_work_size will be an exact multiple of local_work_size
324  global_work_size *= MYWG;
325 
326  //ConsoleMessage ("global_work_size is %d %x right now...\n",global_work_size, global_work_size);
327 
328  local_work_size = MYWG;
329  //ConsoleMessage ("local_work_size %d\n",local_work_size);
330  //ConsoleMessage ("ntri %d, global_work_size %d, local_work_size %d\n",ntri,global_work_size,local_work_size);
331 
332 if (!printedOnce) {
333  cl_context myContext;
334  cl_device_id myDevice;
335  cl_uint myReference;
336 
337 
338  err = clGetCommandQueueInfo(p->CL_queue, CL_QUEUE_CONTEXT, sizeof(myContext), &myContext, NULL);
339 printf ("queue context when commandqueue created %p, should be %p\n", myContext,p->CL_context);
340  err = clGetCommandQueueInfo(p->CL_queue, CL_QUEUE_DEVICE, sizeof(myDevice), &myDevice, NULL);
341 printf ("queue Device when commandqueue created %p, should be %p\n", myDevice,p->CL_device_id);
342  err = clGetCommandQueueInfo(p->CL_queue, CL_QUEUE_REFERENCE_COUNT, sizeof(myReference), &myReference, NULL);
343 printf ("queue Reference when commandqueue created %d\n", myReference);
344 
345 }
346 
347  err = clEnqueueNDRangeKernel(p->CL_queue, me->CollideGPU_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
348 if (!printedOnce) {
349  if (err != CL_SUCCESS) {
350  printCLError("clEnqueueNDRangeKernel",err);
351  return maxdispv;
352  }
353 printedOnce = TRUE;
354 }
355 
356 #ifdef TRY_FLUSH
357  // wait for things to finish
358  err = clFlush(p->CL_queue);
359  if (err != CL_SUCCESS) {
360  printCLError("clFlush",err);
361  return maxdispv;
362  }
363 
364  err = clFinish(p->CL_queue);
365  if (err != CL_SUCCESS) {
366  printCLError("clFinish",err);
367  return maxdispv;
368  }
369 #endif
370 
371  // get the data
372 
373  // get the data
374  /* working code has:
375  err = clEnqueueReadBuffer (me->queue, me->output_buffer,
376  CL_TRUE, 0, sizeof(struct SFColorRGBA) * ntri,
377  me->collide_rvs.p, 0, NULL, NULL);
378 */
379 
380 
381  err = clEnqueueReadBuffer (p->CL_queue, me->CollideGPU_output_buffer,
382  CL_TRUE, 0, sizeof(struct SFColorRGBA) * ntri,
383  me->CollideGPU_returnValues.p, 0, NULL, NULL);
384 
385  if (err != CL_SUCCESS) {
386  printCLError("clEnqueueReadBuffer",err);
387  return maxdispv;
388  }
389 
390 
391  for (i=0; i < ntri; i++) {
392  /* XXX float to double conversion; make a vecdotf for speed */
393  double disp;
394  /* printf ("i %d rv %f %f %f %f\n",i,me->CollideGPU_returnValues.p[i].c[0],
395  me->CollideGPU_returnValues.p[i].c[1],me->CollideGPU_returnValues.p[i].c[2],
396  me->CollideGPU_returnValues.p[i].c[3]);
397  */
398  // we use the last float to indicate whether to bother here; saves us
399  // doing unneeded calculations here
400 
401  if (me->CollideGPU_returnValues.p[i].c[3] > 1.0) {
402  //ConsoleMessage ("possibly triangle %d has some stuff for us\n",i);
403 
404 
405  dispv.x = me->CollideGPU_returnValues.p[i].c[0];
406  dispv.y = me->CollideGPU_returnValues.p[i].c[1];
407  dispv.z = me->CollideGPU_returnValues.p[i].c[2];
408  //ConsoleMessage ("GPU collide tri %d, disp %f %f %f\n",i,dispv.x,dispv.y,dispv.z);
409 
410  /*keep result only if:
411  displacement is positive
412  displacement is smaller than minimum displacement up to date
413  */
414 
415  disp = vecdot (&dispv,&dispv);
416  if ((disp > FLOAT_TOLERANCE) && (disp>maxdisp)) {
417  maxdisp = disp;
418  maxdispv = dispv;
419  }
420  }
421 
422  }
423 
424 
425  //ConsoleMessage ("OpenCL ntri %d - at end of opencl, maxdispv %f %f %f\n",ntri, maxdispv.x, maxdispv.y, maxdispv.z);
426 
427  return maxdispv;
428 }
429 
430 #ifdef GL_ES_VERSION_2_0
431 static const char* collide_non_walk_kernel_headers = " \
432 //#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n\
433 #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable \n\
434 //#pragma OPENCL EXTENSION CL_APPLE_gl_sharing : enable \n\
435 //#pragma OPENCL EXTENSION CL_KHR_gl_sharing : enable \n\
436 //#pragma OPENCL EXTENSION cl_khr_select_fprounding_mode : enable \n\
437 ";
438 #else
439 
440 // OLD_IPHONE_AQUA #if defined (TARGET_AQUA)
441 // OLD_IPHONE_AQUA static const char* collide_non_walk_kernel_headers = " \
442 // OLD_IPHONE_AQUA //#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n\
443 // OLD_IPHONE_AQUA #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable \n\
444 // OLD_IPHONE_AQUA #pragma OPENCL EXTENSION CL_APPLE_gl_sharing : enable \n\
445 // OLD_IPHONE_AQUA #pragma OPENCL EXTENSION CL_KHR_gl_sharing : enable \n\
446 // OLD_IPHONE_AQUA #pragma OPENCL EXTENSION cl_khr_select_fprounding_mode : enable \n\
447 // OLD_IPHONE_AQUA ";
448 // OLD_IPHONE_AQUA #else
449 
450 // this seems to be ok on AMD drivers under Linux
451 static const char* collide_non_walk_kernel_headers = " \
452 #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable \n\
453 ";
454 // OLD_IPHONE_AQUA #endif // AQUA
455 
456 #endif
457 
458 
459 
460 static const char* collide_non_walk_kernel = " \
461  \n\
462 /********************************************************************************/ \n\
463 /* */ \n\
464 /* Collide kernel for fly and examine modes */ \n\
465 /* */ \n\
466 /********************************************************************************/ \n\
467 /* Function prototypes */ \n\
468 float4 closest_point_on_plane(float4 point_a, float4 point_b, float4 point_c); \n\
469  \n\
470 /* start the collide process. \n\
471  \n\
472 1) transform the vertex. \n\
473 2) calculate normal \n\
474 3) if triangle is visible to us, get ready for collide calcs \n\
475  \n\
476 */ \n\
477  \n\
478  \n\
479 #define DOUGS_FLOAT_TOLERANCE 0.00000001 \n\
480 #define FLOAT_TOLERANCE 0.0000001f \n\
481 #define PR_DOUBLESIDED 0x01 \n\
482 #define PR_FRONTFACING 0x02 /* overrides effect of doublesided. */ \n\
483 #define PR_BACKFACING 0x04 /* overrides effect of doublesided, all normals are reversed. */ \n\
484  \n\
485 /********************************************************************************/ \n\
486  \n\
487  \n\
488 #define APPROX (a, b) (fabs(a-b) < FLOAT_TOLERANCE) \n\
489 #define VECSCALE(v,s) (float4)(v.x*s, v.y*s, v.z*s, 0.0f) \n\
490 #define VECLENGTH(v) (float)sqrt((float)dot((float4)v,(float4)v)) \n\
491  \n\
492  \n\
493  \n\
494 /********************************************************************************/ \n\
495 /* */ \n\
496 /* Three vertices; find the closest one which intersects the Z plane; */ \n\
497 /* either we choose a Vertex, on an edge, or fabricate one in the */ \n\
498 /* middle of the triangle somewhere. */ \n\
499 /* */ \n\
500 /* Adapted from \"Real time Collision Detection\", Christer Ericson. */ \n\
501 /* */ \n\
502 /********************************************************************************/ \n\
503  \n\
504  \n\
505 float4 closest_point_on_plane(float4 point_a, float4 point_b, float4 point_c) { \n\
506  float4 vector_ab = (point_b - point_a); // b - a \n\
507  float4 vector_ac = (point_c - point_a); // c - a \n\
508  float4 vector_bc = (point_c - point_b); // c - b \n\
509  float4 vector_ba = (point_a - point_b); // a - b \n\
510  float4 vector_ca = (point_a - point_c); // a - c \n\
511  float4 vector_cb = (point_b - point_c); // b - c \n\
512  \n\
513  \n\
514  // we have moved points, so our bounding sphere is at (0,0,0) so p = (0,0,0) \n\
515  float4 vector_ap = point_a * (float4)(-1.0f, -1.0f, -1.0f, -1.0f); // p - a \n\
516  float4 vector_bp = point_b * (float4)(-1.0f, -1.0f, -1.0f, -1.0f); // p - b \n\
517  float4 vector_cp = point_c * (float4)(-1.0f, -1.0f, -1.0f, -1.0f); // p - c \n\
518  #define vector_pa point_a /* a - p */ \n\
519  #define vector_pb point_b /* b - p */ \n\
520  #define vector_pc point_c /* c - p */ \n\
521  \n\
522  // Step 2. Compute parametric position s for projection P' of P on AB, \n\
523  // P' = A + s*AB, s = snom/(snom+sdenom) \n\
524  \n\
525  float snom = dot(vector_ap, vector_ab); // (p - a, ab); \n\
526  float sdenom = dot(vector_bp, vector_ba); // (p - b, a - b); \n\
527  \n\
528  // Step 3. \n\
529  // Compute parametric position t for projection P' of P on AC, \n\
530  // P' = A + t*AC, s = tnom/(tnom+tdenom) \n\
531  float tnom = dot(vector_ap, vector_ac); // (p - a, ac); \n\
532  float tdenom = dot(vector_cp, vector_ca); // (p - c, a - c); \n\
533  \n\
534  // Step 4. \n\
535  if (snom <= 0.0f && tnom <= 0.0f) { \n\
536  return point_a; \n\
537  } \n\
538  \n\
539  // Step 5. \n\
540  // Compute parametric position u for projection P' of P on BC, \n\
541  // P' = B + u*BC, u = unom/(unom+udenom) \n\
542  float unom = dot(vector_bp, vector_bc); //(p - b, bc) \n\
543  float udenom = dot(vector_cp, vector_cb); // (p - c, b - c); \n\
544  \n\
545  // Step 6. \n\
546  if (sdenom <= 0.0f && unom <= 0.0f) { \n\
547  return point_b; \n\
548  } \n\
549  \n\
550  if (tdenom <= 0.0f && udenom <= 0.0f) { \n\
551  return point_c; \n\
552  } \n\
553  \n\
554  \n\
555  // Step 7. \n\
556  // P is outside (or on) AB if the triple scalar product [N PA PB] <= 0 \n\
557  float4 n; \n\
558  float4 tmp; \n\
559  float vc; \n\
560  \n\
561  n = cross(vector_ab, vector_ac); // (b - a, c - a); \n\
562  tmp = cross(vector_pa, vector_pb); // veccross (a-p, b-p); \n\
563  \n\
564  // vc = dot(n, veccross(a - p, b - p)); \n\
565  vc = dot(n, tmp); \n\
566  \n\
567  \n\
568  // If P outside AB and within feature region of AB, \n\
569  // return projection of P onto AB \n\
570  if (vc <= 0.0f && snom >= 0.0f && sdenom >= 0.0f) { \n\
571  return point_a + snom / (snom + sdenom) * vector_ab; \n\
572  } \n\
573  \n\
574  \n\
575  \n\
576  // Step 8. \n\
577  // P is outside (or on) BC if the triple scalar product [N PB PC] <= 0 \n\
578  tmp = cross (vector_pb, vector_pc); \n\
579  \n\
580  float va = dot(n, tmp); // Cross(b - p, c - p)); \n\
581  \n\
582  // If P outside BC and within feature region of BC, \n\
583  // return projection of P onto BC \n\
584  if (va <= 0.0f && unom >= 0.0f && udenom >= 0.0f) { \n\
585  return point_b + unom / (unom + udenom) * vector_bc; \n\
586  } \n\
587  \n\
588  // Step 9. \n\
589  // P is outside (or on) CA if the triple scalar product [N PC PA] <= 0 \n\
590  tmp = cross (vector_pc, vector_pa); \n\
591  \n\
592  float vb = dot(n, tmp); // Cross(c - p, a - p)); \n\
593  // If P outside CA and within feature region of CA, \n\
594  // return projection of P onto CA \n\
595  if (vb <= 0.0f && tnom >= 0.0f && tdenom >= 0.0f) { \n\
596  return point_a + tnom / (tnom + tdenom) * vector_ac; \n\
597  } \n\
598  \n\
599  // 10. \n\
600  // P must project inside face region. Compute Q using barycentric coordinates \n\
601  float u = va / (va + vb + vc); \n\
602  float v = vb / (va + vb + vc); \n\
603  float w = 1.0f - u - v; // = vc / (va + vb + vc) \n\
604  float4 u4 = (float4)(u); \n\
605  float4 v4 = (float4)(v); \n\
606  float4 w4 = (float4)(w); \n\
607  \n\
608  //return u * point_a + v * point_b + w * point_c; \n\
609  float4 rv = mad(point_a,u4,mad(point_b,v4,point_c*w4)); \n\
610  return rv; \n\
611 } \n\
612  \n\
613 /********************************************************************************/ \n\
614  \n\
615  __kernel void compute_collide ( \n\
616  __global float4 *output, /* 0 */ \n\
617  const unsigned int count, /* 1 */ \n\
618  __global float *mymat, /* 2 */ \n\
619  __global float *my_vertex, /* 3 */ \n\
620  __global short *my_cindex, /* 4 */ \n\
621  const int face_ccw, /* 5 */ \n\
622  const int face_flags, /* 6 */ \n\
623  const float avatar_radius, /* 7 */ \n\
624  const int ntri /* 8 */ \n\
625  ) { \n\
626  \n\
627  /* which index this instantation is working on */ \n\
628  int i_am_canadian = get_global_id(0); \n\
629  if (i_am_canadian >= ntri) return; /* allows for workgroup size sizes */ \n\
630  \n\
631  /* vertices for this triangle */ \n\
632  /* transformed by matrix */ \n\
633  float4 tv1; \n\
634  float4 tv2; \n\
635  float4 tv3; \n\
636  \n\
637  /* starting index in my_vertex of this vertex */ \n\
638  /* we work in triangles; each triangle has 3 vertices */ \n\
639  #define COORD_1 (my_cindex[i_am_canadian*3+0]*3) \n\
640  #define COORD_2 (my_cindex[i_am_canadian*3+1]*3) \n\
641  #define COORD_3 (my_cindex[i_am_canadian*3+2]*3) \n\
642  \n\
643  /* do matrix transform, 4 floats wide. */ \n\
644  float4 matColumn1 = (float4)(convert_float(mymat[0]),convert_float(mymat[1]),convert_float(mymat[2]),0.0f); \n\
645  float4 matColumn2 = (float4)(convert_float(mymat[4]),convert_float(mymat[5]),convert_float(mymat[6]),0.0f); \n\
646  float4 matColumn3 = (float4)(convert_float(mymat[8]),convert_float(mymat[9]),convert_float(mymat[10]),0.0f); \n\
647  float4 matColumn4 = (float4)(convert_float(mymat[12]),convert_float(mymat[13]),convert_float(mymat[14]),0.0f); \n\
648  \n\
649  /* first vertex */ \n\
650  float4 Vertex_X = (float4)(my_vertex[COORD_1+0]); \n\
651  float4 Vertex_Y = (float4)(my_vertex[COORD_1+1]); \n\
652  float4 Vertex_Z = (float4)(my_vertex[COORD_1+2]); \n\
653  tv1 = mad(matColumn1,Vertex_X,mad(matColumn2,Vertex_Y,mad(matColumn3,Vertex_Z,matColumn4))); \n\
654  \n\
655  /* second vertex */ \n\
656  Vertex_X = (float4)(my_vertex[COORD_2+0]); \n\
657  Vertex_Y = (float4)(my_vertex[COORD_2+1]); \n\
658  Vertex_Z = (float4)(my_vertex[COORD_2+2]); \n\
659  tv2 = mad(matColumn1,Vertex_X,mad(matColumn2,Vertex_Y,mad(matColumn3,Vertex_Z,matColumn4))); \n\
660  \n\
661  /* third vertex */ \n\
662  Vertex_X = (float4)(my_vertex[COORD_3+0]); \n\
663  Vertex_Y = (float4)(my_vertex[COORD_3+1]); \n\
664  Vertex_Z = (float4)(my_vertex[COORD_3+2]); \n\
665  tv3 = mad(matColumn1,Vertex_X,mad(matColumn2,Vertex_Y,mad(matColumn3,Vertex_Z,matColumn4))); \n\
666  \n\
667  \n\
668  /* calculate normal for face from transformed vertices */ \n\
669  /* this replicates polynormalf for opencl */ \n\
670  #define VEC_DIST_1 (tv2-tv1) \n\
671  #define VEC_DIST_2 (tv3-tv1) \n\
672  float4 norm = normalize(cross(VEC_DIST_1,VEC_DIST_2)); \n\
673  \n\
674  /* from polyrep_disp_rec2, see that function for full comments */ \n\
675  bool frontfacing; \n\
676  \n\
677  /* how we view it from the avatar */ \n\
678  if (face_ccw) frontfacing = (dot(norm,tv1) < 0); \n\
679  else frontfacing = (dot(norm,tv1) >= 0); \n\
680  \n\
681  /* now, is solid false, or ccw or ccw winded triangle? */ \n\
682  /* if we should do this triangle, the if statement is true */ \n\
683  \n\
684  bool should_do_this_triangle = \n\
685  ((frontfacing && !(face_flags & PR_DOUBLESIDED) ) \n\
686  || ( (face_flags & PR_DOUBLESIDED) && !(face_flags & (PR_FRONTFACING | PR_BACKFACING) ) ) \n\
687  || (frontfacing && (face_flags & PR_FRONTFACING)) \n\
688  || (!frontfacing && (face_flags & PR_BACKFACING)) ); \n\
689  \n\
690  \n\
691  if (!should_do_this_triangle) { \n\
692  output[i_am_canadian] = (float4)(0.0f,0.0f,0.0f,0.0f); \n\
693 \
694  return; \n\
695  } \n\
696  \n\
697  \n\
698  /* if we are down to here, we have to do this triangle */ \n\
699  \n\
700  if(!frontfacing) { /*can only be here in DoubleSided mode*/ \n\
701  /*reverse polygon orientation, and do calculations*/ \n\
702  norm = VECSCALE(norm,-1.0f); \n\
703  } \n\
704  \n\
705  /********************************************************************************/ \n\
706  /* */ \n\
707  /* Collide Kernel Step 2: do hit calculations */ \n\
708  /* replicate Dougs get_poly_min_disp_with_sphere function */ \n\
709  /* */ \n\
710  /********************************************************************************/ \n\
711  \n\
712  float4 closest_point = closest_point_on_plane(tv1,tv2,tv3); \n\
713  \n\
714  float get_poly_mindisp = dot(closest_point,closest_point); \n\
715  \n\
716  if (get_poly_mindisp > (avatar_radius * avatar_radius)) { \n\
717  output[i_am_canadian] = (float4)(0.0f,0.0f,0.0f,0.0f); \n\
718  return; \n\
719  } \n\
720  \n\
721  /* do we have a movement here? */ \n\
722  if (VECLENGTH(closest_point) > FLOAT_TOLERANCE) { \n\
723  float poly_min_rt = sqrt(get_poly_mindisp); \n\
724  float sFactor = (avatar_radius -poly_min_rt) /VECLENGTH(closest_point); \n\
725  \n\
726  float4 result = VECSCALE(closest_point,sFactor); \n\
727  /* copy over the result */ \n\
728  result.w = 100.0f; /* flag that this is a good one */ \n\
729  output[i_am_canadian] = result; \n\
730  return; \n\
731  } \n\
732  \n\
733  \n\
734  /* if we are down to here, we can just return zero */ \n\
735  output[i_am_canadian] = (float4)(0.0f,0.0f,0.0f,0.0f); \n\
736 } \n\
737 ";
738 
739 #endif //HAVE_OPENCL