From 48231196f7ba9ffd80831641f74df302d5d52383 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Wed, 6 Jul 2011 23:08:26 -0400
Subject: [PATCH] =?UTF-8?q?Reduction=20fixes=20for=20RV770=20by=20Ricardo?=
 =?UTF-8?q?=20Am=C3=A9zquita.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pyopencl/characterize.py | 11 +++++++++++
 pyopencl/reduction.py    | 21 ++++++++++++++-------
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/pyopencl/characterize.py b/pyopencl/characterize.py
index 469f05ae..a3b4c608 100644
--- a/pyopencl/characterize.py
+++ b/pyopencl/characterize.py
@@ -11,6 +11,17 @@ def has_double_support(dev):
 
 
 
+def has_amd_double_support(dev):
+    """Fix to allow incomplete AMD double support in low-end boards."""
+
+    for ext in dev.extensions.split(" "):
+        if ext == "cl_amd_fp64":
+            return True
+    return False
+
+
+
+
 def reasonable_work_group_size_multiple(dev, ctx=None):
     try:
         return dev.warp_size_nv
diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py
index 31e29b93..ea5adcc2 100644
--- a/pyopencl/reduction.py
+++ b/pyopencl/reduction.py
@@ -37,7 +37,6 @@ import pyopencl as cl
 from pyopencl.tools import (
         context_dependent_memoize,
         dtype_to_ctype)
-from pytools import memoize_method
 import numpy as np
 import pyopencl._mymako as mako
 
@@ -45,14 +44,18 @@ import pyopencl._mymako as mako
 
 
 KERNEL = """
+
     #define GROUP_SIZE ${group_size}
     #define READ_AND_MAP(i) (${map_expr})
     #define REDUCE(a, b) (${reduce_expr})
 
     % if double_support:
         #pragma OPENCL EXTENSION cl_khr_fp64: enable
+    % elif amd_double_support:
+        #pragma OPENCL EXTENSION cl_amd_fp64: enable
     % endif
 
+
     typedef ${out_type} out_type;
 
     ${preamble}
@@ -149,13 +152,16 @@ def  get_reduction_source(
     # {{{ compute group size
 
     def get_dev_group_size(device):
+        # HACK: cap the work group size at 64 on RV770 boards
+        max_work_group_size=device.max_work_group_size
+        if "RV770" in device.name:
+            max_work_group_size=64
         return min(
-                device.max_work_group_size,
+                max_work_group_size,
                 (device.local_mem_size + out_type_size - 1)
                 // out_type_size)
 
-    group_size = min(
-            get_dev_group_size(dev) for dev in devices)
+    group_size = min(get_dev_group_size(dev) for dev in devices)
 
     if max_group_size is not None:
         group_size = min(max_group_size, group_size)
@@ -180,7 +186,7 @@ def  get_reduction_source(
 
     from mako.template import Template
     from pytools import all
-    from pyopencl.characterize import has_double_support
+    from pyopencl.characterize import has_double_support, has_amd_double_support
     src = str(Template(KERNEL).render(
         out_type=out_type,
         arguments=arguments,
@@ -192,7 +198,9 @@ def  get_reduction_source(
         name=name,
         preamble=preamble,
         double_support=all(
-            has_double_support(dev) for dev in devices)
+            has_double_support(dev) for dev in devices),
+        amd_double_support=all(
+            has_amd_double_support(dev) for dev in devices)
         ))
 
     from pytools import Record
@@ -326,7 +334,6 @@ class ReductionKernel:
                         (group_count,), self.dtype_out,
                         allocator=repr_vec.allocator)
 
-            #print group_count, seq_count, stage_inf.group_size
             stage_inf.kernel(
                     use_queue,
                     (group_count*stage_inf.group_size,),
-- 
GitLab